Adding iree_processor_yield() for use in spin loops. If we're going to spin, we should at least be less bad about it. Most processors treat this as a strong hint that a particular core is spinning, which gives cooperatively-scheduled SMT workloads a chance to run. It also introduces latency that can help lighten the memory-system load caused by sitting and hammering on the spin condition.

commit: ec36a086261dcde6fcf2ea35e7f875ad62a26f46 [log] [tgz]
author: Ben Vanik <ben.vanik@gmail.com> Tue Aug 30 17:39:03 2022 -0700
committer: Ben Vanik <ben.vanik@gmail.com> Tue Aug 30 21:16:40 2022 -0700
tree: 1fb8227181ea7e8266957b25b7b4b5d39818079e
parent: 2b8438fc190af8c3f9cf0a1ac3ccfda254ecb4be [diff]
diff --git a/runtime/src/iree/base/internal/synchronization.c b/runtime/src/iree/base/internal/synchronization.c
index a992086..08518d3 100644
--- a/runtime/src/iree/base/internal/synchronization.c
+++ b/runtime/src/iree/base/internal/synchronization.c

@@ -13,6 +13,10 @@
 
 // Disabled.
 
+#elif defined(IREE_PLATFORM_WINDOWS)
+
+#include <intrin.h>
+
 #elif defined(IREE_PLATFORM_EMSCRIPTEN)
 
 #include <emscripten/threading.h>
@@ -54,6 +58,42 @@
   IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis)
 
 //==============================================================================
+// Cross-platform processor yield (where supported)
+//==============================================================================
+
+#if defined(IREE_COMPILER_MSVC)
+
+// MSVC: spin-wait hint issued via architecture-specific intrinsics.
+
+static inline void iree_processor_yield(void) {
+#if defined(IREE_ARCH_X86_32) || defined(IREE_ARCH_X86_64)
+  // https://docs.microsoft.com/en-us/cpp/intrinsics/x86-intrinsics-list
+  _mm_pause();
+#elif defined(IREE_ARCH_ARM_64)
+  // https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics
+  __yield();
+#else
+  // No intrinsic used for this architecture: empty body, so we spin hard.
+#endif  // IREE_ARCH_*
+}
+
+#else
+
+// Clang/GCC and compatibles: spin-wait hint issued via inline assembly.
+
+static inline void iree_processor_yield(void) {
+#if defined(IREE_ARCH_X86_32) || defined(IREE_ARCH_X86_64)
+  asm volatile("pause");
+#elif defined(IREE_ARCH_ARM_32) || defined(IREE_ARCH_ARM_64)
+  asm volatile("yield");
+#else
+  // No instruction used for this architecture: empty body, so we spin hard.
+#endif  // IREE_ARCH_*
+}
+
+#endif  // IREE_COMPILER_*
+
+//==============================================================================
 // Cross-platform futex mappings (where supported)
 //==============================================================================
 
@@ -472,6 +512,7 @@
       // TODO(benvanik): measure on real workload on ARM; maybe remove entirely.
       int spin_count = 100;
       for (int i = 0; i < spin_count && iree_slim_mutex_is_locked(value); ++i) {
+        iree_processor_yield();
         value =
             iree_atomic_load_int32(&mutex->value, iree_memory_order_relaxed);
       }
commit	ec36a086261dcde6fcf2ea35e7f875ad62a26f46	[log] [tgz]
author	Ben Vanik <ben.vanik@gmail.com>	Tue Aug 30 17:39:03 2022 -0700
committer	Ben Vanik <ben.vanik@gmail.com>	Tue Aug 30 21:16:40 2022 -0700
tree	1fb8227181ea7e8266957b25b7b4b5d39818079e
parent	2b8438fc190af8c3f9cf0a1ac3ccfda254ecb4be [diff]