blob: b6ec44bb161e791969e764bb06bd82712ffc74ee [file]
// Copyright 2020 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// NOTE: the best kind of synchronization is no synchronization; always try to
// design your algorithm so that you don't need anything from this file :)
// See https://travisdowns.github.io/blog/2020/07/06/concurrency-costs.html
#ifndef IREE_BASE_INTERNAL_SYNCHRONIZATION_H_
#define IREE_BASE_INTERNAL_SYNCHRONIZATION_H_
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "iree/base/api.h"
#include "iree/base/internal/atomics.h"
#include "iree/base/target_platform.h"
#include "iree/base/tracing.h"
// NOTE: clang cannot support thread annotations in C code due to some
// representational bugs... which means that we can't use it here. Boo.
// There's some workarounds I've seen but getting TSAN working would be much
// easier as a starting point.
#if 0 // defined(IREE_COMPILER_CLANG)
#define IREE_THREAD_ANNOTATION_ATTRIBUTE(x) __attribute__((x))
#else
#define IREE_THREAD_ANNOTATION_ATTRIBUTE(x)
#endif // IREE_COMPILER_CLANG
#ifdef __cplusplus
// Documents if a shared field or global variable needs to be protected by a
// mutex. IREE_GUARDED_BY() allows the user to specify a particular mutex that
// should be held when accessing the annotated variable.
#define IREE_GUARDED_BY(x) IREE_THREAD_ANNOTATION_ATTRIBUTE(guarded_by(x))
#else
#define IREE_GUARDED_BY(x)
#endif // __cplusplus
#ifdef __cplusplus
// Like IREE_GUARDED_BY but specifies that the contents of a pointer are guarded
// by a mutex instead of the pointer itself.
#define IREE_PTR_GUARDED_BY(x) \
IREE_THREAD_ANNOTATION_ATTRIBUTE(pt_guarded_by(x))
#else
#define IREE_PTR_GUARDED_BY(x)
#endif // __cplusplus
// Allow users to fully disable all synchronization for systems that are known
// to never need it. This removes our dependency on pthreads.
#if !IREE_SYNCHRONIZATION_DISABLE_UNSAFE
#if defined(IREE_PLATFORM_ANDROID) || defined(IREE_PLATFORM_EMSCRIPTEN) || \
defined(IREE_PLATFORM_LINUX) || defined(IREE_PLATFORM_WINDOWS)
#define IREE_PLATFORM_HAS_FUTEX 1
#endif // IREE_PLATFORM_*
#if defined(IREE_PLATFORM_APPLE)
#include <os/lock.h>
#endif // IREE_PLATFORM_APPLE
#if !defined(IREE_PLATFORM_WINDOWS)
#include <pthread.h>
#endif // !IREE_PLATFORM_WINDOWS
// We have the CRITICAL_SECTION path for now but Slim Reader/Writer lock (SRW)
// is much better (and what std::mutex uses). SRW doesn't spin, though, and has
// some other implications that don't quite line up with pthread_mutex_t on most
// platforms. Once we have larger end-to-end benchmarks we should choose based
// on workloads.
#define IREE_MUTEX_USE_WIN32_SRW 1
#endif // !IREE_SYNCHRONIZATION_DISABLE_UNSAFE
#ifdef __cplusplus
extern "C" {
#endif
#define IREE_ALL_WAITERS INT32_MAX
#define IREE_INFINITE_TIMEOUT_MS UINT32_MAX
//==============================================================================
// iree_mutex_t
//==============================================================================
// A normal fat mutex (ala std::mutex).
// This may be implemented as a slim mutex on certain platforms but in the worst
// case will be the native platform primitive (like pthread_mutex_t) and as such
// should not be embedded in structures meant to be kept small.
//
// Windows: Slim Reader/Writer (SRW) Locks
// All others: pthread_mutex_t
// Storage for the fat mutex. Exactly one `value` variant is active per
// platform/configuration; see the selection notes in the comment block above.
typedef struct iree_mutex_t IREE_THREAD_ANNOTATION_ATTRIBUTE(
    capability("mutex")) {
#if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
  // All synchronization compiled out; placeholder keeps the struct non-empty.
  int reserved;
#elif defined(IREE_PLATFORM_WINDOWS) && defined(IREE_MUTEX_USE_WIN32_SRW)
  // Win32 Slim Reader/Writer lock (see IREE_MUTEX_USE_WIN32_SRW note above).
  SRWLOCK value;
#elif defined(IREE_PLATFORM_WINDOWS)
  // Legacy Win32 path when SRW is not selected.
  CRITICAL_SECTION value;
#else
  // POSIX mutex on all non-Windows platforms.
  pthread_mutex_t value;
#endif  // IREE_PLATFORM_*
#if (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_SLOW_LOCKS)
  // ID used to correlate this lock's events in the tracing UI; populated by
  // iree_mutex_initialize_impl when slow-lock tracing is enabled.
  uint32_t lock_id;
#endif  // IREE_TRACING_FEATURE_SLOW_LOCKS
} iree_mutex_t;
#if (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_SLOW_LOCKS)
// Initializes |out_mutex| to the well-defined unlocked contents.
// Must be called prior to using any other iree_mutex_* method.
//
// When slow-lock tracing is enabled this captures the call site (function,
// file, line) so the lock can be identified in the tracing UI.
//
// Wrapped in do/while(0) so the expansion behaves as a single statement and
// is safe in unbraced if/else bodies (the previous form declared a variable
// and emitted a trailing `;`, breaking such callers). The `static const`
// source location has program lifetime, so passing its address out of the
// do/while scope is safe.
#define iree_mutex_initialize(out_mutex)                                     \
  do {                                                                       \
    static const iree_tracing_location_t TracyConcat(                        \
        __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__,  \
                                              (uint32_t)__LINE__, 0};        \
    iree_mutex_initialize_impl(                                              \
        &TracyConcat(__tracy_source_location, __LINE__), out_mutex);         \
  } while (0)
void iree_mutex_initialize_impl(const iree_tracing_location_t* src_loc,
                                iree_mutex_t* out_mutex);
#else
// Initializes |out_mutex| to the well-defined unlocked contents.
// Must be called prior to using any other iree_mutex_* method.
void iree_mutex_initialize(iree_mutex_t* out_mutex);
#endif  // IREE_TRACING_FEATURE_SLOW_LOCKS
// Deinitializes |mutex| (after a prior call to iree_mutex_initialize).
// The mutex must not be held by any thread.
void iree_mutex_deinitialize(iree_mutex_t* mutex)
    IREE_THREAD_ANNOTATION_ATTRIBUTE(locks_excluded(mutex));
// Locks the |mutex| and returns when held by the caller.
// Blocks until the lock can be acquired.
void iree_mutex_lock(iree_mutex_t* mutex)
    IREE_THREAD_ANNOTATION_ATTRIBUTE(acquire_capability(mutex));
// Tries to lock the |mutex| and returns true if the caller holds the lock.
// Returns false if the lock could not be acquired.
bool iree_mutex_try_lock(iree_mutex_t* mutex)
    IREE_THREAD_ANNOTATION_ATTRIBUTE(try_acquire_capability(true, mutex));
// Unlocks the |mutex|, which must be held by the caller.
void iree_mutex_unlock(iree_mutex_t* mutex)
    IREE_THREAD_ANNOTATION_ATTRIBUTE(release_capability(mutex));
//==============================================================================
// iree_slim_mutex_t
//==============================================================================
// TODO(benvanik): instrument with tracy; need to capture source location on
// init and add storage for ID.
// A lightweight unfair lock.
// Depending on platform this is significantly smaller than a mutex (4-8 bytes
// vs 64+ bytes), can always be statically initialized/requires no allocations,
// and performs the minimal amount of work possible while still playing nicely
// with the OS thread scheduler.
//
// Unlike a full mutex these don't have the ability to be shared across
// processes (not something we care about), don't have a way to define timeouts,
// and have only a binary held/unheld state. They are often an order of
// magnitude faster in uncontended/lightly-contended code and the same
// performance in highly-contended code, though, so it's worth it for locks that
// may be guarding small data structures (queue pointers, etc) touched from many
// threads. Since they are so lightweight it's possible to embed them per-object
// instead of per-manager and change from a single highly-contended lock to
// thousands of almost completely uncontended slim locks.
//
// Though these locks have lightweight return paths (using atomics ops, see
// the below paragraph, with or without spinning), they always have a
// heavyweight fallback path that ends up calling into the kernel to properly
// let the thread wait. This is critical to avoid pathological cases under
// contention and allowing for thread priority inheritance when there are
// multiple threads competing that may otherwise be scheduled in a potentially
// livelocking order.
//
// The "unfair" here comes from the fact that it's possible on certain platforms
// for certain threads to never be able to acquire the lock in cases of
// extremely high contention or widely disparate thread priority levels. This is
// mitigated by ensuring only very small regions of code are guarded and that
// there's enough work happening outside of the lock on any particular thread to
// ensure that there's some chance of other threads being able to acquire it.
//
// Notes on weakly-ordered atomics (for the lightweight return paths)
// ------------------------------------------------------------------
//
// The lightweight return paths (avoiding waiting in the kernel) are typically
// implemented using acquire/release weakly-ordered atomics. When these code
// paths are taken:
//
// * iree_slim_mutex_lock is a read-modify-write operation on the
// mutex with memory_order_acquire.
// * iree_slim_mutex_unlock is a read-modify-write operation on the
// mutex with memory_order_release.
//
// This means the following guarantee for the caller of iree_slim_mutex_lock:
//
// When iree_slim_mutex_lock returns on this thread T1 from having waited on
// another thread T2 calling iree_slim_mutex_unlock, all memory read and write
// operations performed on thread T2 prior to calling iree_slim_mutex_unlock
// are guaranteed to "happen-before" any subsequent memory read or write on
// thread T1.
//
// This is meant to be an implementation detail within the standard contract
// for anything called a "mutex". The C++ standard is a good reference for what
// it means to be a mutex in the context of the C11/C++11 memory model:
// https://eel.is/c++draft/thread.mutex#requirements.mutex
//
// It is not trivial why the above-described memory orders happen to be
// sufficient to meet these mutex requirements, so here is a FAQ:
//
// Q: Is it really OK with the memory model for mutex lock and unlock to be mere
// atomic operations, and wouldn't they at least need to have sequentially-
// consistent order?
// A: https://eel.is/c++draft/thread.mutex.requirements.mutex.general says:
// > The implementation provides lock and unlock operations, as described
// > below. For purposes of determining the existence of a data race, these
// > behave as atomic operations ([intro.multithread]). The lock and unlock
// > operations on a single mutex appear to occur in a single total order.
// Key here is "on a single mutex". There is no ordering requirement across
// operations on separate mutex objects. That is what sequentially-consistent
// atomics would be needed for. The only ordering requirement is among ops
// "on a single mutex" and that is what one can implement with careful use
// of acquire/release atomics.
//
// Q: Since iree_slim_mutex_{lock,unlock} are read-modify-write operations,
// shouldn't they have at least memory_order_acq_rel so that both the
// read and write aspects of each of them construct happens-before
// relationships with each other?
// A: Separate answer regarding iree_slim_mutex_lock and iree_slim_mutex_unlock:
// * Why iree_slim_mutex_lock only needs acquire order, not release:
// When iree_slim_mutex_lock returns, the calling thread T1 holds the
// lock. A call to iree_slim_mutex_lock on another thread T2 can't return
// until thread T1 calls iree_slim_mutex_unlock, which already has
// memory_order_release.
// * Why iree_slim_mutex_unlock only needs release order, not acquire:
// By requirement of iree_slim_mutex_unlock, the calling thread had
// already called iree_slim_mutex_lock prior to calling
// iree_slim_mutex_unlock. Mutual exclusion implies that no other thread
// can have operated on this mutex object since that iree_slim_mutex_lock
// call on the calling thread.
//
// OS-specific implementation aspects (for the heavyweight return paths)
// ---------------------------------------------------------------------
//
// MacOS/iOS: os_unfair_lock
// Spins and after a short backoff drops to a futex-like behavior of waiting
// in the kernel. Unfortunately real futexes aren't supported.
// See:
// https://developer.apple.com/documentation/os/synchronization
// https://opensource.apple.com/source/libplatform/libplatform-125/src/os/lock.c.auto.html
//
// Emscripten: emscripten_futex_wait/emscripten_futex_wake
// Spins and after a short backoff drops to a futex-like behavior of waiting
// in the kernel.
// See:
// https://github.com/emscripten-core/emscripten/blob/b43474f55aeb49083b9df74fdd0e52ec8decf788/system/include/emscripten/threading.h#L114-L120
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Atomics/wait
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Atomics/notify
//
// Windows: WaitOnAddress/WakeByAddress*
// Spins and after a short backoff drops to a futex and waits in the kernel.
// See:
// https://docs.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-waitonaddress
// https://devblogs.microsoft.com/oldnewthing/20170601-00/?p=96265
//
// Linux/Android/others: futex
// Spins and after a short backoff drops to a futex and waits in the kernel.
// See:
// http://locklessinc.com/articles/futex_cheat_sheet/
// https://man7.org/linux/man-pages/man2/futex.2.html
// https://eli.thegreenplace.net/2018/basics-of-futexes/
// https://bartoszmilewski.com/2008/09/01/thin-lock-vs-futex/
// Storage for the slim mutex. Exactly one member is active per
// platform/configuration; see the long comment block above for the rationale
// behind each OS-specific implementation.
typedef struct iree_slim_mutex_t IREE_THREAD_ANNOTATION_ATTRIBUTE(
    capability("mutex")) {
#if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
  // All synchronization compiled out; placeholder keeps the struct non-empty.
  int reserved;
#elif (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_FAST_LOCKS)
  iree_mutex_t impl;  // re-route to slow mutex
#elif defined(IREE_PLATFORM_APPLE)
  // MacOS/iOS: no public futex API, so use os_unfair_lock.
  os_unfair_lock value;
#elif defined(IREE_PLATFORM_WINDOWS) && defined(IREE_MUTEX_USE_WIN32_SRW)
  SRWLOCK value;
#elif defined(IREE_PLATFORM_HAS_FUTEX)
  // Futex-based path: the entire lock state lives in one atomic word.
  iree_atomic_int32_t value;
#else
  iree_mutex_t impl;  // fallback
#endif  // IREE_PLATFORM_*
} iree_slim_mutex_t;
#if (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_FAST_LOCKS)
// Initializes |out_mutex| to the well-defined unlocked contents.
// Must be called prior to using any other iree_slim_mutex_* method.
//
// When fast-lock tracing is enabled this captures the call site (function,
// file, line) so the lock can be identified in the tracing UI.
//
// Wrapped in do/while(0) so the expansion behaves as a single statement and
// is safe in unbraced if/else bodies (the previous form declared a variable
// and emitted a trailing `;`, breaking such callers). The `static const`
// source location has program lifetime, so passing its address out of the
// do/while scope is safe.
#define iree_slim_mutex_initialize(out_mutex)                                \
  do {                                                                       \
    static const iree_tracing_location_t TracyConcat(                        \
        __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__,  \
                                              (uint32_t)__LINE__, 0};        \
    iree_slim_mutex_initialize_impl(                                         \
        &TracyConcat(__tracy_source_location, __LINE__), out_mutex);         \
  } while (0)
void iree_slim_mutex_initialize_impl(const iree_tracing_location_t* src_loc,
                                     iree_slim_mutex_t* out_mutex);
#else
// Initializes |out_mutex| to the well-defined unlocked contents.
// Must be called prior to using any other iree_slim_mutex_* method.
//
// Though optional (static initialization is fine) this is required to support
// lock tracing. Assume it's (mostly) free and always call it if possible. This
// also allows us to swap in a non-slim lock for enhanced debugging if we run
// into threading issues.
void iree_slim_mutex_initialize(iree_slim_mutex_t* out_mutex);
#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_FAST_LOCKS
// Deinitializes |mutex| (after a prior call to iree_slim_mutex_initialize).
// The mutex must not be held by any thread.
void iree_slim_mutex_deinitialize(iree_slim_mutex_t* mutex)
    IREE_THREAD_ANNOTATION_ATTRIBUTE(locks_excluded(mutex));
// Locks the |mutex| and returns when held by the caller.
// See the notes above on memory ordering: acts as (at least) a
// memory_order_acquire read-modify-write on the mutex.
void iree_slim_mutex_lock(iree_slim_mutex_t* mutex)
    IREE_THREAD_ANNOTATION_ATTRIBUTE(acquire_capability(mutex));
// Tries to lock the |mutex| and returns true if the caller holds the lock.
// Returns false if the lock could not be acquired.
bool iree_slim_mutex_try_lock(iree_slim_mutex_t* mutex)
    IREE_THREAD_ANNOTATION_ATTRIBUTE(try_acquire_capability(true, mutex));
// Unlocks the |mutex|, which must be held by the caller.
// See the notes above on memory ordering: acts as (at least) a
// memory_order_release read-modify-write on the mutex.
void iree_slim_mutex_unlock(iree_slim_mutex_t* mutex)
    IREE_THREAD_ANNOTATION_ATTRIBUTE(release_capability(mutex));
//==============================================================================
// iree_notification_t
//==============================================================================
// TODO(benvanik): add tracy support for watching the waits.
// A lightweight wait-free cross-thread notification mechanism.
// Classically called an 'event counter', these replace the use of condvars in
// lock-free code where you wouldn't want to guard a lock-free data structure
// with a lock.
//
// See:
// http://www.1024cores.net/home/lock-free-algorithms/eventcounts
// https://software.intel.com/en-us/forums/intel-threading-building-blocks/topic/299245
// https://github.com/r10a/Event-Counts
// https://github.com/facebook/folly/blob/master/folly/experimental/EventCount.h
// https://github.com/concurrencykit/ck/blob/master/include/ck_ec.h
// Storage for the notification ('event counter'); see the overview above.
typedef struct iree_notification_t {
#if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
  // Nothing required. Unused field to make compilers happy.
  int reserved;
#elif !defined(IREE_PLATFORM_HAS_FUTEX)
  // No futex on darwin/when using TSAN, so use mutex/condvar instead.
  pthread_mutex_t mutex;
  pthread_cond_t cond;
  // Generation counter compared against iree_wait_token_t values; presumably
  // advanced on post and guarded by |mutex| — confirm against the impl.
  uint32_t epoch;
  // Count of threads currently waiting on |cond| — confirm against the impl.
  uint32_t waiters;
#else
  // Futex-based path: all notification state packed into one atomic word.
  iree_atomic_int64_t value;
#endif  // IREE_PLATFORM_*
} iree_notification_t;
// Static initializer matching the per-platform struct layout above.
#if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
// NOTE(review): this branch initializes the plain `int reserved` field with
// IREE_ATOMIC_VAR_INIT — fine if that macro expands to a plain value, but
// verify; a bare `{0}` would be unambiguous here.
#define IREE_NOTIFICATION_INIT \
  { IREE_ATOMIC_VAR_INIT(0) }
#elif !defined(IREE_PLATFORM_HAS_FUTEX)
// Mutex/condvar fallback: {mutex, cond, epoch=0, waiters=0}.
#define IREE_NOTIFICATION_INIT \
  { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0 }
#else
// Futex path: single atomic word, initially 0.
#define IREE_NOTIFICATION_INIT \
  { IREE_ATOMIC_VAR_INIT(0) }
#endif  // notification type
// Initializes a notification to no waiters and an initial epoch of 0.
void iree_notification_initialize(iree_notification_t* out_notification);
// Deinitializes |notification| (after a prior call to
// iree_notification_initialize). No threads may be waiting on the notification.
void iree_notification_deinitialize(iree_notification_t* notification);
// Notifies up to |count| waiters of a change. Each waiter will wake and can
// check to see if they need to do any additional work.
// To notify all potential waiters pass IREE_ALL_WAITERS.
//
// Acts as (at least) a memory_order_release operation on the
// notification object. See the comment on iree_notification_commit_wait, which
// is the memory_order_acquire operation that is meant to pair with that.
void iree_notification_post(iree_notification_t* notification, int32_t count);
// Opaque token captured by iree_notification_prepare_wait and consumed by
// iree_notification_commit_wait; callers must not interpret its value.
typedef uint32_t iree_wait_token_t;  // opaque
// Prepares for a wait operation, returning a token that must be passed to
// iree_notification_commit_wait to perform the actual wait.
//
// Acts as a memory_order_acq_rel read-modify-write operation on the
// notification object. See the comment on iree_notification_commit_wait for a
// general explanation of acquire/release semantics in this context.
iree_wait_token_t iree_notification_prepare_wait(
    iree_notification_t* notification);
// Commits a pending wait operation when the caller has ensured it must wait.
// Waiting will continue until a notification has been posted or |deadline_ns|
// is reached. Returns false if the deadline is reached before a notification is
// posted.
//
// If |spin_ns| is not IREE_DURATION_ZERO the wait _may_ spin for at least the
// specified duration before entering the system wait API.
//
// Acts as (at least) a memory_order_acquire operation on the notification
// object. This is meant to be paired with iree_notification_post, which is a
// memory_order_release operation. This means the following guarantee:
// When iree_notification_commit_wait returns on this thread T1 from having
// waited on another thread T2 calling iree_notification_post, all memory read
// and write operations performed on thread T2 prior to calling
// iree_notification_post are guaranteed to "happen-before" any subsequent
// memory read or write on thread T1.
bool iree_notification_commit_wait(iree_notification_t* notification,
                                   iree_wait_token_t wait_token,
                                   iree_duration_t spin_ns,
                                   iree_time_t deadline_ns);
// Cancels a pending wait operation without blocking.
// Call this instead of iree_notification_commit_wait when, after preparing,
// the caller discovers it no longer needs to wait.
//
// Acts as (at least) a memory_order_relaxed barrier:
// Relaxed operation: there are no synchronization or ordering constraints
// imposed on other reads or writes, only this operation's atomicity is
// guaranteed.
void iree_notification_cancel_wait(iree_notification_t* notification);
// Returns true if the condition is true.
// |arg| is the |condition_arg| passed to the await function.
// Implementations must ensure they are coherent with their state values.
typedef bool (*iree_condition_fn_t)(void* arg);
// Blocks and waits until |condition_fn| returns true. Other threads must modify
// state checked by the |condition_fn| and post the notification.
// Returns true if the condition is true before |timeout| is reached. If the
// timeout is infinite then the return will always be true.
//
// Example:
//  thread 1:
//   bool check_flag_pred(void* arg) {
//     return iree_atomic_int32_load((iree_atomic_int32_t*)arg,
//                                   iree_memory_order_acquire) == 1;
//   }
//   iree_atomic_int32_t* flag = ...;
//   iree_notification_await(&notification, check_flag_pred, flag);
//  thread 2:
//   iree_atomic_int32_store(flag, 1, iree_memory_order_release);
//   iree_notification_post(&notification, IREE_ALL_WAITERS);
bool iree_notification_await(iree_notification_t* notification,
                             iree_condition_fn_t condition_fn,
                             void* condition_arg, iree_timeout_t timeout);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // IREE_BASE_INTERNAL_SYNCHRONIZATION_H_