Adding iree_processor_yield() for use in spin loops.
If we're going to spin we should at least be less bad about it.
Most processors treat this as a strong hint that a particular core is
spinning, which gives cooperatively-scheduled SMT workloads a chance
to run. It also introduces latency that can help lighten the memory
system load that sitting and hammering the spin condition can cause.
diff --git a/runtime/src/iree/base/internal/synchronization.c b/runtime/src/iree/base/internal/synchronization.c
index a992086..08518d3 100644
--- a/runtime/src/iree/base/internal/synchronization.c
+++ b/runtime/src/iree/base/internal/synchronization.c
@@ -13,6 +13,10 @@
 
 // Disabled.
 
+#elif defined(IREE_PLATFORM_WINDOWS)
+
+#include <intrin.h>
+
 #elif defined(IREE_PLATFORM_EMSCRIPTEN)
 
 #include <emscripten/threading.h>
@@ -54,6 +58,42 @@
   IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis)
 
 //==============================================================================
+// Cross-platform processor yield (where supported)
+//==============================================================================
+
+#if defined(IREE_COMPILER_MSVC)
+
+// Hints to the processor that the calling thread is in a spin loop.
+// With MSVC this maps to an architecture-specific intrinsic.
+static inline void iree_processor_yield(void) {
+#if defined(IREE_ARCH_ARM_64)
+  // https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics
+  __yield();
+#elif defined(IREE_ARCH_X86_64) || defined(IREE_ARCH_X86_32)
+  // https://docs.microsoft.com/en-us/cpp/intrinsics/x86-intrinsics-list
+  _mm_pause();
+#else
+  // No yield intrinsic is mapped for this architecture; callers spin hard.
+#endif  // IREE_ARCH_*
+}
+
+#else
+
+// Spin-loop hint via inline assembly for Clang/GCC and compatibles.
+// Uses __asm__ (not asm) so this also builds in strict -std=c11 mode.
+static inline void iree_processor_yield(void) {
+#if defined(IREE_ARCH_X86_32) || defined(IREE_ARCH_X86_64)
+  __asm__ __volatile__("pause");
+#elif defined(IREE_ARCH_ARM_32) || defined(IREE_ARCH_ARM_64)
+  __asm__ __volatile__("yield");
+#else
+  // None available; we'll spin hard.
+#endif  // IREE_ARCH_*
+}
+
+#endif  // IREE_COMPILER_*
+
+//==============================================================================
 // Cross-platform futex mappings (where supported)
 //==============================================================================
 
@@ -472,6 +512,7 @@
       // TODO(benvanik): measure on real workload on ARM; maybe remove entirely.
       int spin_count = 100;
       for (int i = 0; i < spin_count && iree_slim_mutex_is_locked(value); ++i) {
+        iree_processor_yield();
         value =
             iree_atomic_load_int32(&mutex->value, iree_memory_order_relaxed);
       }