Adding iree_processor_yield() for use in spin loops.
If we're going to spin we should at least be less bad about it.
Most processors use this as a strong hint that a particular core is
spinning in order to allow cooperatively-scheduled smt workloads a chance
to run. It also introduces latency that can help lighten the memory
system load that sitting and hammering the spin condition can cause.
diff --git a/runtime/src/iree/base/internal/synchronization.c b/runtime/src/iree/base/internal/synchronization.c
index a992086..08518d3 100644
--- a/runtime/src/iree/base/internal/synchronization.c
+++ b/runtime/src/iree/base/internal/synchronization.c
@@ -13,6 +13,10 @@
// Disabled.
+#elif defined(IREE_PLATFORM_WINDOWS)
+
+#include <intrin.h>
+
#elif defined(IREE_PLATFORM_EMSCRIPTEN)
#include <emscripten/threading.h>
@@ -54,6 +58,42 @@
IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis)
//==============================================================================
+// Cross-platform processor yield (where supported)
+//==============================================================================
+
+#if defined(IREE_COMPILER_MSVC)
+
+// MSVC uses architecture-specific intrinsics.
+
+static inline void iree_processor_yield(void) {
+#if defined(IREE_ARCH_X86_32) || defined(IREE_ARCH_X86_64)
+ // https://docs.microsoft.com/en-us/cpp/intrinsics/x86-intrinsics-list
+ _mm_pause();
+#elif defined(IREE_ARCH_ARM_64)
+ // https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics
+ __yield();
+#else
+ // None available; we'll spin hard.
+#endif // IREE_ARCH_*
+}
+
+#else
+
+// Clang/GCC and compatibles use architecture-specific inline assembly.
+
+static inline void iree_processor_yield(void) {
+#if defined(IREE_ARCH_X86_32) || defined(IREE_ARCH_X86_64)
+ asm volatile("pause");
+#elif defined(IREE_ARCH_ARM_32) || defined(IREE_ARCH_ARM_64)
+ asm volatile("yield");
+#else
+ // None available; we'll spin hard.
+#endif // IREE_ARCH_*
+}
+
+#endif // IREE_COMPILER_*
+
+//==============================================================================
// Cross-platform futex mappings (where supported)
//==============================================================================
@@ -472,6 +512,7 @@
// TODO(benvanik): measure on real workload on ARM; maybe remove entirely.
int spin_count = 100;
for (int i = 0; i < spin_count && iree_slim_mutex_is_locked(value); ++i) {
+ iree_processor_yield();
value =
iree_atomic_load_int32(&mutex->value, iree_memory_order_relaxed);
}