runtime/src/iree/base/internal/synchronization.h - 3p/openxla/iree - Git at Google

 // Copyright 2020 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 // NOTE: the best kind of synchronization is no synchronization; always try to
 // design your algorithm so that you don't need anything from this file :)
 // See https://travisdowns.github.io/blog/2020/07/06/concurrency-costs.html

 #ifndef IREE_BASE_INTERNAL_SYNCHRONIZATION_H_
 #define IREE_BASE_INTERNAL_SYNCHRONIZATION_H_

 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>

 #include "iree/base/api.h"
 #include "iree/base/internal/atomics.h"
 #include "iree/base/target_platform.h"
 #include "iree/base/tracing.h"

 // NOTE: clang cannot support thread annotations in C code due to some
 // representational bugs... which means that we can't use it here. Boo.
 // There's some workarounds I've seen but getting TSAN working would be much
 // easier as a starting point.
 #if 0  // defined(IREE_COMPILER_CLANG)
 #define IREE_THREAD_ANNOTATION_ATTRIBUTE(x) __attribute__((x))
 #else
 #define IREE_THREAD_ANNOTATION_ATTRIBUTE(x)
 #endif  // IREE_COMPILER_CLANG

 #ifdef __cplusplus
 // Documents if a shared field or global variable needs to be protected by a
 // mutex. IREE_GUARDED_BY() allows the user to specify a particular mutex that
 // should be held when accessing the annotated variable.
 #define IREE_GUARDED_BY(x) IREE_THREAD_ANNOTATION_ATTRIBUTE(guarded_by(x))
 #else
 #define IREE_GUARDED_BY(x)
 #endif  // __cplusplus

 #ifdef __cplusplus
 // Like IREE_GUARDED_BY but specifies that the contents of a pointer are guarded
 // by a mutex instead of the pointer itself.
 #define IREE_PTR_GUARDED_BY(x) \
   IREE_THREAD_ANNOTATION_ATTRIBUTE(pt_guarded_by(x))
 #else
 #define IREE_PTR_GUARDED_BY(x)
 #endif  // __cplusplus

 // Allow users to fully disable all synchronization for systems that are known
 // to never need it. This removes our dependency on pthreads.
 #if !IREE_SYNCHRONIZATION_DISABLE_UNSAFE

 #if defined(IREE_PLATFORM_ANDROID) || defined(IREE_PLATFORM_EMSCRIPTEN) || \
     defined(IREE_PLATFORM_LINUX) || defined(IREE_PLATFORM_WINDOWS)
 #define IREE_PLATFORM_HAS_FUTEX 1
 #endif  // IREE_PLATFORM_*

 #if defined(IREE_PLATFORM_APPLE)
 #include <os/lock.h>
 #endif  // IREE_PLATFORM_APPLE

 #if !defined(IREE_PLATFORM_WINDOWS)
 #include <pthread.h>
 #endif  // !IREE_PLATFORM_WINDOWS

 // We have the CRITICAL_SECTION path for now but Slim Reader/Writer lock (SRW)
 // is much better (and what std::mutex uses). SRW doesn't spin, though, and has
 // some other implications that don't quite line up with pthread_mutex_t on most
 // platforms. Once we have larger end-to-end benchmarks we should choose based
 // on workloads.
 #define IREE_MUTEX_USE_WIN32_SRW 1

 #endif  // !IREE_SYNCHRONIZATION_DISABLE_UNSAFE

 #ifdef __cplusplus
 extern "C" {
 #endif

 #define IREE_ALL_WAITERS INT32_MAX
 #define IREE_INFINITE_TIMEOUT_MS UINT32_MAX

 //==============================================================================
 // iree_mutex_t
 //==============================================================================

 // A normal fat mutex (ala std::mutex).
 // This may be implemented as a slim mutex on certain platforms but in the worst
 // case will be the native platform primitive (like pthread_mutex_t) and as such
 // should not be embedded in structures meant to be kept small.
 //
 // Windows: Slim Reader/Writer (SRW) Locks
 // All others: pthread_mutex_t
 typedef struct iree_mutex_t IREE_THREAD_ANNOTATION_ATTRIBUTE(
     capability("mutex")) {
 #if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
   int reserved;
 #elif defined(IREE_PLATFORM_WINDOWS) && defined(IREE_MUTEX_USE_WIN32_SRW)
   SRWLOCK value;
 #elif defined(IREE_PLATFORM_WINDOWS)
   CRITICAL_SECTION value;
 #else
   pthread_mutex_t value;
 #endif  // IREE_PLATFORM_*
 #if (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_SLOW_LOCKS)
   uint32_t lock_id;
 #endif  // IREE_TRACING_FEATURE_SLOW_LOCKS
 } iree_mutex_t;

 #if (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_SLOW_LOCKS)
 // Initializes |out_mutex| to the well-defined unlocked contents.
 // Must be called prior to using any other iree_mutex_* method.
 #define iree_mutex_initialize(out_mutex)                                      \
   static const iree_tracing_location_t TracyConcat(                           \
       __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__,     \
                                             (uint32_t)__LINE__, 0};           \
   iree_mutex_initialize_impl(&TracyConcat(__tracy_source_location, __LINE__), \
                              out_mutex);
 void iree_mutex_initialize_impl(const iree_tracing_location_t* src_loc,
                                 iree_mutex_t* out_mutex);
 #else
 // Initializes |out_mutex| to the well-defined unlocked contents.
 // Must be called prior to using any other iree_mutex_* method.
 void iree_mutex_initialize(iree_mutex_t* out_mutex);
 #endif  // IREE_TRACING_FEATURE_SLOW_LOCKS

 // Deinitializes |mutex| (after a prior call to iree_mutex_initialize).
 // The mutex must not be held by any thread.
 void iree_mutex_deinitialize(iree_mutex_t* mutex)
     IREE_THREAD_ANNOTATION_ATTRIBUTE(locks_excluded(mutex));

 // Locks the |mutex| and returns when held by the caller.
 void iree_mutex_lock(iree_mutex_t* mutex)
     IREE_THREAD_ANNOTATION_ATTRIBUTE(acquire_capability(mutex));

 // Tries to lock the |mutex| and returns true if the caller holds the lock.
 bool iree_mutex_try_lock(iree_mutex_t* mutex)
     IREE_THREAD_ANNOTATION_ATTRIBUTE(try_acquire_capability(true, mutex));

 // Unlocks the |mutex|, which must be held by the caller.
 void iree_mutex_unlock(iree_mutex_t* mutex)
     IREE_THREAD_ANNOTATION_ATTRIBUTE(release_capability(mutex));

 //==============================================================================
 // iree_slim_mutex_t
 //==============================================================================

 // TODO(benvanik): instrument with tracy; need to capture source location on
 // init and add storage for ID.

 // A lightweight unfair lock.
 // Depending on platform this is significantly smaller than a mutex (4-8 bytes
 // vs 64+ bytes), can always be statically initialized/requires no allocations,
 // and performs the minimal amount of work possible while still playing nicely
 // with the OS thread scheduler.
 //
 // Unlike a full mutex these don't have the ability to be shared across
 // processes (not something we care about), don't have a way to define timeouts,
 // and have only a binary held/unheld state. They are often an order of
 // magnitude faster in uncontended/lightly-contended code and the same
 // performance in highly-contended code, though, so it's worth it for locks that
 // be guarding small data structures (queue pointers, etc) and touched from many
 // threads. Since they are so lightweight it's possible to embed them per-object
 // instead of per-manager and change from a single highly-contended lock to
 // thousands of almost completely uncontended slim locks.
 //
 // Though these locks have lightweight return paths (using atomics ops, see
 // the below paragraph, with or without spinning), they always have a
 // heavyweight fallback path that ends up calling into the kernel to properly
 // let the thread wait. This is critical to avoid pathological cases under
 // contention and allowing for thread priority inheritance when there are
 // multiple threads competing that may otherwise be scheduled in a potentially
 // livelocking order.
 //
 // The "unfair" here comes from the fact that it's possible on certain platforms
 // for certain threads to never be able to acquire the lock in cases of
 // extremely high contention or widely disparate thread priority levels. This is
 // mitigated by ensuring only very small regions of code are guarded and that
 // there's enough work happening outside of the lock on any particular thread to
 // ensure that there's some chance of other threads being able to acquire it.
 //
 // Notes on weakly-ordered atomics (for the lightweight return paths)
 // ------------------------------------------------------------------
 //
 // The lightweight return paths (avoiding waiting in the kernel) are typically
 // implemented using acquire/release weakly-ordered atomics. When these code
 // paths are taken:
 //
 //   * iree_slim_mutex_lock is a read-modify-write operation on the
 //     mutex with memory_order_acquire.
 //   * iree_slim_mutex_unlock is a read-modify-write operation on the
 //     mutex with memory_order_release.
 //
 // This means the following guarantee for the caller of iree_slim_mutex_lock:
 //
 //   When iree_slim_mutex_lock returns on this thread T1 from having waited on
 //   another thread T2 calling iree_slim_mutex_unlock, all memory read and write
 //   operations performed on thread T2 prior to calling iree_slim_mutex_unlock
 //   are guaranteed to "happen-before" any subsequent memory read or write on
 //   thread T1.
 //
 // This is meant to be an implementation detail within the standard contract
 // for anything called a "mutex". The C++ standard is a good reference for what
 // it means to be a mutexin the context of the C11/C++11 memory model:
 // https://eel.is/c++draft/thread.mutex#requirements.mutex
 //
 // It is not trivial why the above-described memory orders happen to be
 // sufficient to meet these mutex requirements, so here is a FAQ:
 //
 // Q: Is it really OK with the memory model for mutex lock and unlock to be mere
 //    atomic operations, and wouldn't they at least need to have sequentially-
 //    consistent order?
 // A: https://eel.is/c++draft/thread.mutex.requirements.mutex.general  says:
 //      > The implementation provides lock and unlock operations, as described
 //      > below. For purposes of determining the existence of a data race, these
 //      > behave as atomic operations ([intro.multithread]). The lock and unlock
 //      > operations on a single mutex appears to occur in a single total order.
 //    Key here is "on a single mutex". There is no ordering requirement across
 //    operations on separate mutex objects. That is what sequentially-consistent
 //    atomics would be needed for. The only ordering requirement is among ops
 //    "on a single mutex" and that is what one can implement with careful use
 //    of acquire/release atomics.
 //
 // Q: Since iree_slim_mutex_{lock,unlock} are read-modify-write operations,
 //    shouldn't they have at least memory_order_acq_rel so that both the
 //    read and write aspects of each of them construct happens-before
 //    relationships with each other?
 // A: Separate answer regarting iree_slim_mutex_lock and iree_slim_mutex_unlock:
 //    * Why iree_slim_mutex_lock only needs acquire order, not release:
 //        When iree_slim_mutex_lock returns, the calling thread T1 holds the
 //        lock. A call to iree_slim_mutex_lock on another thread T2 can't return
 //        until thread T1 calls iree_slim_mutex_unlock, which already has
 //        memory_order_release.
 //    * Why iree_slim_mutex_unlock only needs release order, not acquire:
 //        By requirement of iree_slim_mutex_unlock, the calling thread had
 //        already called iree_slim_mutex_lock prior to calling
 //        iree_slim_mutex_unlock. Mutual exclusion implies that no other thread
 //        can have operated on this mutex object since that iree_slim_mutex_lock
 //        call on the calling thread.
 //
 // OS-specific implementation aspects (for the heavyweight return paths)
 // ---------------------------------------------------------------------
 //
 // MacOS/iOS: os_unfair_lock
 //   Spins and after a short backoff drops to a futex-like behavior of waiting
 //   in the kernel. Unfortunately real futexes aren't supported.
 // See:
 //   https://developer.apple.com/documentation/os/synchronization
 //   https://opensource.apple.com/source/libplatform/libplatform-125/src/os/lock.c.auto.html
 //
 // Emscripten: emscripten_futex_wait/emscripten_futex_wake
 //   Spins and after a short backoff drops to a futex-like behavior of waiting
 //   in the kernel.
 // See:
 //   https://github.com/emscripten-core/emscripten/blob/b43474f55aeb49083b9df74fdd0e52ec8decf788/system/include/emscripten/threading.h#L114-L120
 //   https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Atomics/wait
 //   https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Atomics/notify
 //
 // Windows: WaitOnAddress/WakeByAddress*
 //   Spins and after a short backoff drops to a futex and waits in the kernel.
 // See:
 //   https://docs.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-waitonaddress
 //   https://devblogs.microsoft.com/oldnewthing/20170601-00/?p=96265
 //
 // Linux/Android/others: futex
 //   Spins and after a short backoff drops to a futex and waits in the kernel.
 // See:
 //   http://locklessinc.com/articles/futex_cheat_sheet/
 //   https://man7.org/linux/man-pages/man2/futex.2.html
 //   https://eli.thegreenplace.net/2018/basics-of-futexes/
 //   https://bartoszmilewski.com/2008/09/01/thin-lock-vs-futex/
 typedef struct iree_slim_mutex_t IREE_THREAD_ANNOTATION_ATTRIBUTE(
     capability("mutex")) {
 #if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
   int reserved;
 #elif (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_FAST_LOCKS)
   iree_mutex_t impl;  // re-route to slow mutex
 #elif defined(IREE_PLATFORM_APPLE)
   os_unfair_lock value;
 #elif defined(IREE_PLATFORM_WINDOWS) && defined(IREE_MUTEX_USE_WIN32_SRW)
   SRWLOCK value;
 #elif defined(IREE_PLATFORM_HAS_FUTEX)
   iree_atomic_int32_t value;
 #else
   iree_mutex_t impl;  // fallback
 #endif  // IREE_PLATFORM_*
 } iree_slim_mutex_t;

 #if (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_FAST_LOCKS)
 // Initializes |out_mutex| to the well-defined unlocked contents.
 // Must be called prior to using any other iree_slim_mutex_* method.
 #define iree_slim_mutex_initialize(out_mutex)                             \
   static const iree_tracing_location_t TracyConcat(                       \
       __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__, \
                                             (uint32_t)__LINE__, 0};       \
   iree_slim_mutex_initialize_impl(                                        \
       &TracyConcat(__tracy_source_location, __LINE__), out_mutex);
 void iree_slim_mutex_initialize_impl(const iree_tracing_location_t* src_loc,
                                      iree_slim_mutex_t* out_mutex);
 #else
 // Initializes |out_mutex| to the well-defined unlocked contents.
 // Must be called prior to using any other iree_slim_mutex_* method.
 //
 // Though optional (static initialization is fine) this is required to support
 // lock tracing. Assume it's (mostly) free and always call it if possible. This
 // also allows us to swap in a non-slim lock for enhanced debugging if we run
 // into threading issues.
 void iree_slim_mutex_initialize(iree_slim_mutex_t* out_mutex);
 #endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_FAST_LOCKS

 // Deinitializes |mutex| (after a prior call to iree_slim_mutex_initialize).
 // The mutex must not be held by any thread.
 void iree_slim_mutex_deinitialize(iree_slim_mutex_t* mutex)
     IREE_THREAD_ANNOTATION_ATTRIBUTE(locks_excluded(mutex));

 // Locks the |mutex| and returns when held by the caller.
 void iree_slim_mutex_lock(iree_slim_mutex_t* mutex)
     IREE_THREAD_ANNOTATION_ATTRIBUTE(acquire_capability(mutex));

 // Tries to lock the |mutex| and returns true if the caller holds the lock.
 bool iree_slim_mutex_try_lock(iree_slim_mutex_t* mutex)
     IREE_THREAD_ANNOTATION_ATTRIBUTE(try_acquire_capability(true, mutex));

 // Unlocks the |mutex|, which must be held by the caller.
 void iree_slim_mutex_unlock(iree_slim_mutex_t* mutex)
     IREE_THREAD_ANNOTATION_ATTRIBUTE(release_capability(mutex));

 //==============================================================================
 // iree_notification_t
 //==============================================================================

 // TODO(benvanik): add tracy support for watching the waits.

 // A lightweight wait-free cross-thread notification mechanism.
 // Classically called an 'event counter', these replace the use of condvars in
 // lock-free code where you wouldn't want to guard a lock-free data structure
 // with a lock.
 //
 // See:
 // http://www.1024cores.net/home/lock-free-algorithms/eventcounts
 // https://software.intel.com/en-us/forums/intel-threading-building-blocks/topic/299245
 // https://github.com/r10a/Event-Counts
 // https://github.com/facebook/folly/blob/master/folly/experimental/EventCount.h
 // https://github.com/concurrencykit/ck/blob/master/include/ck_ec.h
 typedef struct iree_notification_t {
 #if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
   // Nothing required. Unused field to make compilers happy.
   int reserved;
 #elif !defined(IREE_PLATFORM_HAS_FUTEX)
   // No futex on darwin/when using TSAN, so use mutex/condvar instead.
   pthread_mutex_t mutex;
   pthread_cond_t cond;
   uint32_t epoch;
   uint32_t waiters;
 #else
   iree_atomic_int64_t value;
 #endif  // IREE_PLATFORM_*
 } iree_notification_t;

 #if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
 #define IREE_NOTIFICATION_INIT \
   { IREE_ATOMIC_VAR_INIT(0) }
 #elif !defined(IREE_PLATFORM_HAS_FUTEX)
 #define IREE_NOTIFICATION_INIT \
   { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0 }
 #else
 #define IREE_NOTIFICATION_INIT \
   { IREE_ATOMIC_VAR_INIT(0) }
 #endif  // notification type

 // Initializes a notification to no waiters and an initial epoch of 0.
 void iree_notification_initialize(iree_notification_t* out_notification);

 // Deinitializes |notification| (after a prior call to
 // iree_notification_initialize). No threads may be waiting on the notification.
 void iree_notification_deinitialize(iree_notification_t* notification);

 // Notifies up to |count| waiters of a change. Each waiter will wake and can
 // check to see if they need to do any additional work.
 // To notify all potential waiters pass IREE_ALL_WAITERS.
 //
 // Acts as (at least) a memory_order_release operation on the
 // notification object. See the comment on iree_notification_commit_wait, which
 // is the memory_order_acquire operation that is meant to pair with that.
 void iree_notification_post(iree_notification_t* notification, int32_t count);

 typedef uint32_t iree_wait_token_t;  // opaque

 // Prepares for a wait operation, returning a token that must be passed to
 // iree_notification_commit_wait to perform the actual wait.
 //
 // Acts as a memory_order_acq_rel read-modify-write operation on the
 // notification object. See the comment on iree_notification_commit_wait for a
 // general explanation of acquire/release semantics in this context.
 iree_wait_token_t iree_notification_prepare_wait(
     iree_notification_t* notification);

 // Commits a pending wait operation when the caller has ensured it must wait.
 // Waiting will continue until a notification has been posted or |deadline_ns|
 // is reached. Returns false if the deadline is reached before a notification is
 // posted.
 //
 // If |spin_ns| is not IREE_DURATION_ZERO the wait _may_ spin for at least the
 // specified duration before entering the system wait API.
 //
 // Acts as (at least) a memory_order_acquire operation on the notification
 // object. This is meant to be paired with iree_notification_post, which is a
 // memory_order_release operation. This means the following guarantee:
 // When iree_notification_commit_wait returns on this thread T1 from having
 // waited on another thread T2 calling iree_notification_post, all memory read
 // and write operations performed on thread T2 prior to calling
 // iree_notification_post are guaranteed to "happen-before" any subsequent
 // memory read or write on thread T1.
 bool iree_notification_commit_wait(iree_notification_t* notification,
                                    iree_wait_token_t wait_token,
                                    iree_duration_t spin_ns,
                                    iree_time_t deadline_ns);

 // Cancels a pending wait operation without blocking.
 //
 // Acts as (at least) a memory_order_relaxed barrier:
 //   Relaxed operation: there are no synchronization or ordering constraints
 //   imposed on other reads or writes, only this operation's atomicity is
 //   guaranteed.
 void iree_notification_cancel_wait(iree_notification_t* notification);

 // Returns true if the condition is true.
 // |arg| is the |condition_arg| passed to the await function.
 // Implementations must ensure they are coherent with their state values.
 typedef bool (*iree_condition_fn_t)(void* arg);

 // Blocks and waits until |condition_fn| returns true. Other threads must modify
 // state checked by the |condition_fn| and post the notification.
 // Returns true if the condition is true before |timeout| is reached. If the
 // timeout is infinite then the return will always be true.
 //
 // Example:
 //  thread 1:
 //   bool check_flag_pred(void* arg) {
 //     return iree_atomic_int32_load((iree_atomic_int32_t*)arg,
 //                                   iree_memory_order_acquire) == 1;
 //   }
 //   iree_atomic_int32_t* flag = ...;
 //   iree_notification_await(&notification, check_flag_pred, flag);
 //  thread 2:
 //   iree_atomic_int32_store(flag, 1, iree_memory_order_release);
 //   iree_notification_post(&notification, IREE_ALL_WAITERS);
 bool iree_notification_await(iree_notification_t* notification,
                              iree_condition_fn_t condition_fn,
                              void* condition_arg, iree_timeout_t timeout);

 #ifdef __cplusplus
 }  // extern "C"
 #endif

 #endif  // IREE_BASE_INTERNAL_SYNCHRONIZATION_H_