Adding semaphore creation and wait flags for controlling behavior. (#21619)

This set of changes adds arguments to the semaphore APIs. At creation,
applications can signal to implementations how they intend to use a
semaphore (which queues it may be used on, whether it needs interrupt
services, and whether it is exportable). When waiting, they can indicate
what kind of latency they expect as a hint for the wait behavior.

No implementations are updated here; all of these flags are for future
work. Behavior defaults to what it was before these additions
(interrupts/exports requested, no special handling of queue affinity,
and blocking waits). This is the most general behavior and should
always be correct; only usage of the new flags can potentially create
problems.

Fixes #21615.
diff --git a/experimental/web/sample_webgpu/main.c b/experimental/web/sample_webgpu/main.c
index b30ccea..8b08964 100644
--- a/experimental/web/sample_webgpu/main.c
+++ b/experimental/web/sample_webgpu/main.c
@@ -786,8 +786,9 @@
   }
   iree_hal_semaphore_t* signal_semaphore = NULL;
   if (iree_status_is_ok(status)) {
-    status = iree_hal_semaphore_create(
-        device, 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &signal_semaphore);
+    status = iree_hal_semaphore_create(device, IREE_HAL_QUEUE_AFFINITY_ANY,
+                                       0ull, IREE_HAL_SEMAPHORE_FLAG_DEFAULT,
+                                       &signal_semaphore);
   }
   uint64_t signal_value = 1ull;
   if (iree_status_is_ok(status)) {
@@ -808,7 +809,8 @@
   //   (requires moving off of nop_semaphore and wait source import)
   if (iree_status_is_ok(status)) {
     status = iree_hal_semaphore_wait(signal_semaphore, signal_value,
-                                     iree_infinite_timeout());
+                                     iree_infinite_timeout(),
+                                     IREE_HAL_WAIT_FLAG_DEFAULT);
   }
   iree_hal_command_buffer_release(transfer_command_buffer);
   iree_hal_semaphore_release(signal_semaphore);
diff --git a/experimental/webgpu/webgpu_device.c b/experimental/webgpu/webgpu_device.c
index 8d9bdb0..d80e7ac 100644
--- a/experimental/webgpu/webgpu_device.c
+++ b/experimental/webgpu/webgpu_device.c
@@ -300,8 +300,9 @@
 }
 
 static iree_status_t iree_hal_webgpu_device_create_semaphore(
-    iree_hal_device_t* base_device, uint64_t initial_value,
-    iree_hal_semaphore_flags_t flags, iree_hal_semaphore_t** out_semaphore) {
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    uint64_t initial_value, iree_hal_semaphore_flags_t flags,
+    iree_hal_semaphore_t** out_semaphore) {
   iree_hal_webgpu_device_t* device = iree_hal_webgpu_device_cast(base_device);
   return iree_hal_webgpu_nop_semaphore_create(
       initial_value, device->host_allocator, out_semaphore);
@@ -322,8 +323,9 @@
     iree_device_size_t allocation_size, iree_hal_alloca_flags_t flags,
     iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
   // TODO(benvanik): queue-ordered allocations.
-  IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list,
-                                                    iree_infinite_timeout()));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_semaphore_list_wait(wait_semaphore_list, iree_infinite_timeout(),
+                                   IREE_HAL_WAIT_FLAG_DEFAULT));
   IREE_RETURN_IF_ERROR(
       iree_hal_allocator_allocate_buffer(iree_hal_device_allocator(base_device),
                                          params, allocation_size, out_buffer));
@@ -401,8 +403,9 @@
   // to change a bit to properly support waiting on host-signaled semaphores.
   // All work is ordered against the WebGPU queues and there's only one queue so
   // there's really not much to do.
-  IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list,
-                                                    iree_infinite_timeout()));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_semaphore_list_wait(wait_semaphore_list, iree_infinite_timeout(),
+                                   IREE_HAL_WAIT_FLAG_DEFAULT));
 
   // TODO(benvanik): propagate errors to semaphores.
   IREE_RETURN_IF_ERROR(
@@ -421,7 +424,8 @@
 
 static iree_status_t iree_hal_webgpu_device_wait_semaphores(
     iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
-    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) {
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags) {
   return iree_make_status(
       IREE_STATUS_UNIMPLEMENTED,
       "iree_hal_webgpu_device_wait_semaphores not yet implemented");
diff --git a/integrations/pjrt/src/iree_pjrt/common/api_impl.cc b/integrations/pjrt/src/iree_pjrt/common/api_impl.cc
index f91c8b1..4d375cd 100644
--- a/integrations/pjrt/src/iree_pjrt/common/api_impl.cc
+++ b/integrations/pjrt/src/iree_pjrt/common/api_impl.cc
@@ -535,7 +535,8 @@
 
   iree::vm::ref<iree_hal_semaphore_t> semaphore;
   IREE_RETURN_IF_ERROR(iree_hal_semaphore_create(
-      device_.device(), 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore));
+      device_.device(), IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+      IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore));
 
   // Signaled when `dst_buffer` is ready to be consumed.
   iree::vm::ref<iree_hal_fence_t> dst_buffer_ready_fence;
@@ -818,9 +819,11 @@
       /*param_count=*/0, /*params=*/nullptr, client_.host_allocator(),
       &device_));
   IREE_RETURN_IF_ERROR(iree_hal_semaphore_create(
-      device(), 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &main_timeline_));
+      device(), IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+      IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &main_timeline_));
   IREE_RETURN_IF_ERROR(iree_hal_semaphore_create(
-      device(), 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &transfer_timeline_));
+      device(), IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+      IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &transfer_timeline_));
 
   return iree_ok_status();
 }
@@ -1714,8 +1717,8 @@
     signal_thread_ = std::make_unique<std::thread>(
         [](EventInstance* event_instance,
            iree::vm::ref<iree_hal_fence_t> fence) {
-          iree_status_t wait_status =
-              iree_hal_fence_wait(fence.get(), iree_infinite_timeout());
+          iree_status_t wait_status = iree_hal_fence_wait(
+              fence.get(), iree_infinite_timeout(), IREE_HAL_WAIT_FLAG_DEFAULT);
           event_instance->SignalReady(wait_status);
         },
         this, std::move(fence));
diff --git a/runtime/bindings/python/hal.cc b/runtime/bindings/python/hal.cc
index 52f06ad..18f0feb 100644
--- a/runtime/bindings/python/hal.cc
+++ b/runtime/bindings/python/hal.cc
@@ -383,10 +383,10 @@
 
 HalSemaphore HalDevice::CreateSemaphore(uint64_t initial_value) {
   iree_hal_semaphore_t* out_sem;
-  CheckApiStatus(
-      iree_hal_semaphore_create(raw_ptr(), initial_value,
-                                IREE_HAL_SEMAPHORE_FLAG_NONE, &out_sem),
-      "creating semaphore");
+  CheckApiStatus(iree_hal_semaphore_create(
+                     raw_ptr(), IREE_HAL_QUEUE_AFFINITY_ANY, initial_value,
+                     IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &out_sem),
+                 "creating semaphore");
   return HalSemaphore::StealFromRawPtr(out_sem);
 }
 
@@ -1678,7 +1678,8 @@
             uint64_t unused_value;
             {
               py::gil_scoped_release release;
-              status = iree_hal_semaphore_wait(self.raw_ptr(), payload, t);
+              status = iree_hal_semaphore_wait(self.raw_ptr(), payload, t,
+                                               IREE_HAL_WAIT_FLAG_DEFAULT);
             }
             if (iree_status_is_deadline_exceeded(status)) {
               // Time out.
@@ -1821,7 +1822,8 @@
             iree_status_t status;
             {
               py::gil_scoped_release release;
-              status = iree_hal_fence_wait(self.raw_ptr(), t);
+              status = iree_hal_fence_wait(self.raw_ptr(), t,
+                                           IREE_HAL_WAIT_FLAG_DEFAULT);
             }
             if (iree_status_is_deadline_exceeded(status)) {
               // Time out.
diff --git a/runtime/bindings/python/loop.cc b/runtime/bindings/python/loop.cc
index 7ab8727..f0ddf13 100644
--- a/runtime/bindings/python/loop.cc
+++ b/runtime/bindings/python/loop.cc
@@ -35,10 +35,10 @@
       : device_(std::move(device)), loop_(std::move(loop)) {
     IREE_PY_TRACEF("new HalDeviceLoopBridge (%p)", this);
     iree_slim_mutex_initialize(&mu_);
-    CheckApiStatus(
-        iree_hal_semaphore_create(device_.raw_ptr(), 0,
-                                  IREE_HAL_SEMAPHORE_FLAG_NONE, &control_sem_),
-        "create semaphore");
+    CheckApiStatus(iree_hal_semaphore_create(
+                       device_.raw_ptr(), IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+                       IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &control_sem_),
+                   "create semaphore");
 
     loop_call_soon_ = loop_.attr("call_soon_threadsafe");
 
@@ -173,7 +173,7 @@
           device_.raw_ptr(), IREE_HAL_WAIT_MODE_ANY,
           {wait_semaphores.size(), wait_semaphores.data(),
            wait_payloads.data()},
-          iree_infinite_timeout());
+          iree_infinite_timeout(), IREE_HAL_WAIT_FLAG_DEFAULT);
       if (!iree_status_is_ok(status)) {
         py::gil_scoped_acquire acquire_gil;
         CheckApiStatus(
@@ -183,8 +183,8 @@
       status = iree_hal_semaphore_query(control_sem_, &next_control_wakeup);
       if (!iree_status_is_ok(status)) {
         py::gil_scoped_acquire acquire_gil;
-        CheckApiStatus(
-            status, "iree_hal_device_wait_semaphores from HalDeviceLoopBridge");
+        CheckApiStatus(status,
+                       "iree_hal_semaphore_query from HalDeviceLoopBridge");
       }
       next_control_wakeup += 1;
       IREE_PY_TRACEF("HalDeviceLoopBridge::Run(%p): Loop end", this);
diff --git a/runtime/src/iree/hal/buffer_transfer.c b/runtime/src/iree/hal/buffer_transfer.c
index 47729b6..8d95ff0 100644
--- a/runtime/src/iree/hal/buffer_transfer.c
+++ b/runtime/src/iree/hal/buffer_transfer.c
@@ -63,7 +63,8 @@
   // idle.
   iree_hal_semaphore_t* fence_semaphore = NULL;
   iree_status_t status = iree_hal_semaphore_create(
-      device, 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &fence_semaphore);
+      device, IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+      IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &fence_semaphore);
   uint64_t signal_value = 1ull;
   if (iree_status_is_ok(status)) {
     iree_hal_semaphore_list_t wait_semaphores = {
@@ -82,7 +83,8 @@
         IREE_HAL_EXECUTE_FLAG_NONE);
   }
   if (iree_status_is_ok(status)) {
-    status = iree_hal_semaphore_wait(fence_semaphore, signal_value, timeout);
+    status = iree_hal_semaphore_wait(fence_semaphore, signal_value, timeout,
+                                     IREE_HAL_WAIT_FLAG_DEFAULT);
   }
 
   iree_hal_command_buffer_release(command_buffer);
diff --git a/runtime/src/iree/hal/cts/cts_test_base.h b/runtime/src/iree/hal/cts/cts_test_base.h
index f0bbdf9..adea3c4 100644
--- a/runtime/src/iree/hal/cts/cts_test_base.h
+++ b/runtime/src/iree/hal/cts/cts_test_base.h
@@ -225,7 +225,8 @@
     // One signal semaphore from 0 -> 1.
     iree_hal_semaphore_t* signal_semaphore = NULL;
     IREE_RETURN_IF_ERROR(iree_hal_semaphore_create(
-        device_, 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &signal_semaphore));
+        device_, IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+        IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &signal_semaphore));
     uint64_t target_payload_value = 1ull;
     iree_hal_semaphore_list_t signal_semaphores = {
         /*count=*/1,
@@ -239,7 +240,8 @@
         IREE_HAL_EXECUTE_FLAG_NONE);
     if (iree_status_is_ok(status)) {
       status = iree_hal_semaphore_wait(signal_semaphore, target_payload_value,
-                                       iree_infinite_timeout());
+                                       iree_infinite_timeout(),
+                                       IREE_HAL_WAIT_FLAG_DEFAULT);
     }
 
     iree_hal_semaphore_release(signal_semaphore);
@@ -260,8 +262,9 @@
 
   iree_hal_semaphore_t* CreateSemaphore() {
     iree_hal_semaphore_t* semaphore = NULL;
-    IREE_EXPECT_OK(iree_hal_semaphore_create(
-        device_, 0, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore));
+    IREE_EXPECT_OK(
+        iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+                                  IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore));
     return semaphore;
   }
 
diff --git a/runtime/src/iree/hal/cts/file_test.h b/runtime/src/iree/hal/cts/file_test.h
index 9c8c8f9..22f7f94 100644
--- a/runtime/src/iree/hal/cts/file_test.h
+++ b/runtime/src/iree/hal/cts/file_test.h
@@ -93,8 +93,9 @@
   CreatePatternedDeviceBuffer(file_size, 0xCD, &buffer);
 
   iree_hal_semaphore_t* semaphore = NULL;
-  IREE_ASSERT_OK(iree_hal_semaphore_create(
-      device_, 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore));
   iree_hal_fence_t* wait_fence = NULL;
   IREE_ASSERT_OK(iree_hal_fence_create_at(
       semaphore, 1ull, iree_allocator_system(), &wait_fence));
@@ -113,7 +114,8 @@
       /*source_offset=*/0, /*target_buffer=*/buffer, /*target_offset=*/0,
       /*length=*/file_size, IREE_HAL_READ_FLAG_NONE));
 
-  IREE_ASSERT_OK(iree_hal_fence_wait(signal_fence, iree_infinite_timeout()));
+  IREE_ASSERT_OK(iree_hal_fence_wait(signal_fence, iree_infinite_timeout(),
+                                     IREE_HAL_WAIT_FLAG_DEFAULT));
   iree_hal_fence_release(wait_fence);
   iree_hal_fence_release(signal_fence);
   iree_hal_semaphore_release(semaphore);
diff --git a/runtime/src/iree/hal/cts/semaphore_submission_test.h b/runtime/src/iree/hal/cts/semaphore_submission_test.h
index df6f148..09cc0ef 100644
--- a/runtime/src/iree/hal/cts/semaphore_submission_test.h
+++ b/runtime/src/iree/hal/cts/semaphore_submission_test.h
@@ -34,8 +34,9 @@
   IREE_ASSERT_OK(iree_hal_device_queue_barrier(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY, iree_hal_semaphore_list_empty(),
       signal_semaphores, IREE_HAL_EXECUTE_FLAG_NONE));
-  IREE_ASSERT_OK(
-      iree_hal_semaphore_wait(signal_semaphore, 1, iree_infinite_timeout()));
+  IREE_ASSERT_OK(iree_hal_semaphore_wait(signal_semaphore, 1,
+                                         iree_infinite_timeout(),
+                                         IREE_HAL_WAIT_FLAG_DEFAULT));
 
   iree_hal_semaphore_release(signal_semaphore);
 }
@@ -56,8 +57,9 @@
       device_, IREE_HAL_QUEUE_AFFINITY_ANY, iree_hal_semaphore_list_empty(),
       signal_semaphores, command_buffer, iree_hal_buffer_binding_table_empty(),
       IREE_HAL_EXECUTE_FLAG_NONE));
-  IREE_ASSERT_OK(
-      iree_hal_semaphore_wait(signal_semaphore, 1, iree_infinite_timeout()));
+  IREE_ASSERT_OK(iree_hal_semaphore_wait(signal_semaphore, 1,
+                                         iree_infinite_timeout(),
+                                         IREE_HAL_WAIT_FLAG_DEFAULT));
 
   iree_hal_command_buffer_release(command_buffer);
   iree_hal_semaphore_release(signal_semaphore);
@@ -77,7 +79,8 @@
   };
   iree_hal_semaphore_t* signal_semaphore = NULL;
   IREE_ASSERT_OK(iree_hal_semaphore_create(
-      device_, 100, IREE_HAL_SEMAPHORE_FLAG_NONE, &signal_semaphore));
+      device_, IREE_HAL_QUEUE_AFFINITY_ANY, 100ull,
+      IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &signal_semaphore));
   uint64_t signal_payload_values[] = {101};
   iree_hal_semaphore_list_t signal_semaphores = {
       1,
@@ -95,8 +98,9 @@
 
   // Signal the wait semaphore, work should begin and complete.
   IREE_ASSERT_OK(iree_hal_semaphore_signal(wait_semaphore, 1));
-  IREE_ASSERT_OK(
-      iree_hal_semaphore_wait(signal_semaphore, 101, iree_infinite_timeout()));
+  IREE_ASSERT_OK(iree_hal_semaphore_wait(signal_semaphore, 101,
+                                         iree_infinite_timeout(),
+                                         IREE_HAL_WAIT_FLAG_DEFAULT));
 
   iree_hal_command_buffer_release(command_buffer);
   iree_hal_semaphore_release(wait_semaphore);
@@ -141,8 +145,8 @@
   IREE_ASSERT_OK(iree_hal_semaphore_signal(wait_semaphore_1, 1));
   IREE_ASSERT_OK(iree_hal_semaphore_signal(wait_semaphore_2, 1));
 
-  IREE_ASSERT_OK(
-      iree_hal_semaphore_list_wait(signal_semaphores, iree_infinite_timeout()));
+  IREE_ASSERT_OK(iree_hal_semaphore_list_wait(
+      signal_semaphores, iree_infinite_timeout(), IREE_HAL_WAIT_FLAG_DEFAULT));
 
   iree_hal_command_buffer_release(command_buffer);
   iree_hal_semaphore_release(wait_semaphore_1);
@@ -182,7 +186,8 @@
   // Start another thread and have it wait.
   std::thread thread([&]() {
     IREE_ASSERT_OK(iree_hal_semaphore_wait(
-        host_wait_semaphore, 1, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+        host_wait_semaphore, 1, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+        IREE_HAL_WAIT_FLAG_DEFAULT));
     IREE_ASSERT_OK(iree_hal_semaphore_signal(host_signal_semaphore, 1));
   });
 
@@ -204,7 +209,8 @@
       main_payload_values};
   IREE_ASSERT_OK(iree_hal_device_wait_semaphores(
       device_, IREE_HAL_WAIT_MODE_ALL, main_wait_semaphores,
-      iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+      iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
   thread.join();
 
   iree_hal_command_buffer_release(command_buffer);
@@ -247,7 +253,8 @@
   // Start another thread and have it wait.
   std::thread thread([&]() {
     IREE_ASSERT_OK(iree_hal_semaphore_wait(
-        host_wait_semaphore, 1, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+        host_wait_semaphore, 1, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+        IREE_HAL_WAIT_FLAG_DEFAULT));
     IREE_ASSERT_OK(iree_hal_semaphore_signal(host_signal_semaphore, 1));
   });
 
@@ -267,7 +274,8 @@
       main_payload_values};
   IREE_ASSERT_OK(iree_hal_device_wait_semaphores(
       device_, IREE_HAL_WAIT_MODE_ANY, main_wait_semaphores,
-      iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+      iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
 
   // Check that the device has signaled but the host thread hasn't.
   CheckSemaphoreValue(host_signal_semaphore, 0);
@@ -316,7 +324,8 @@
   // Start another thread and have it wait.
   std::thread thread([&]() {
     IREE_ASSERT_OK(iree_hal_semaphore_wait(
-        host_wait_semaphore, 1, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+        host_wait_semaphore, 1, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+        IREE_HAL_WAIT_FLAG_DEFAULT));
     IREE_ASSERT_OK(iree_hal_semaphore_signal(host_signal_semaphore, 1));
   });
 
@@ -337,7 +346,8 @@
       main_payload_values};
   IREE_ASSERT_OK(iree_hal_device_wait_semaphores(
       device_, IREE_HAL_WAIT_MODE_ANY, main_wait_semaphores,
-      iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+      iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
   thread.join();
 
   // Check that the host thread has signaled but the device hasn't.
@@ -346,9 +356,9 @@
 
   // Signal and wait for the device to complete too.
   IREE_ASSERT_OK(iree_hal_semaphore_signal(device_wait_semaphore, 1));
-  IREE_ASSERT_OK(
-      iree_hal_semaphore_wait(device_signal_semaphore, 1,
-                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+  IREE_ASSERT_OK(iree_hal_semaphore_wait(
+      device_signal_semaphore, 1, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
 
   iree_hal_command_buffer_release(command_buffer);
   iree_hal_semaphore_release(host_wait_semaphore);
@@ -402,13 +412,15 @@
   // Wait on the intermediate semaphore and check its value.
   IREE_ASSERT_OK(
       iree_hal_semaphore_wait(semaphore1, semaphore_signal_value,
-                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+                              IREE_HAL_WAIT_FLAG_DEFAULT));
   CheckSemaphoreValue(semaphore1, semaphore_signal_value);
 
   // Wait on the second semaphore and check its value.
   IREE_ASSERT_OK(
       iree_hal_semaphore_wait(semaphore2, semaphore_signal_value,
-                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+                              IREE_HAL_WAIT_FLAG_DEFAULT));
   CheckSemaphoreValue(semaphore2, semaphore_signal_value);
 
   iree_hal_command_buffer_release(command_buffer1);
@@ -480,14 +492,16 @@
   // Wait and check that semaphore values have advanced.
   IREE_ASSERT_OK(
       iree_hal_semaphore_wait(semaphore21, semaphore_signal_wait_value,
-                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+                              IREE_HAL_WAIT_FLAG_DEFAULT));
   CheckSemaphoreValue(semaphore21, semaphore_signal_wait_value);
   // semaphore11 must have also advanced because semaphore21 has advanced.
   CheckSemaphoreValue(semaphore11, semaphore_signal_wait_value);
 
   IREE_ASSERT_OK(
       iree_hal_semaphore_wait(semaphore22, semaphore_signal_wait_value,
-                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+                              IREE_HAL_WAIT_FLAG_DEFAULT));
   CheckSemaphoreValue(semaphore22, semaphore_signal_wait_value);
 
   iree_hal_semaphore_release(semaphore11);
@@ -571,14 +585,16 @@
   // Wait and check that semaphore values have advanced.
   IREE_ASSERT_OK(
       iree_hal_semaphore_wait(semaphore21, semaphore2x_signal_value,
-                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+                              IREE_HAL_WAIT_FLAG_DEFAULT));
   CheckSemaphoreValue(semaphore21, semaphore2x_signal_value);
   // semaphore11 must have advanced, because semaphore22 has advanced already.
   CheckSemaphoreValue(semaphore11, command_buffer11_semaphore11_signal_value);
 
   IREE_ASSERT_OK(
       iree_hal_semaphore_wait(semaphore22, semaphore2x_signal_value,
-                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+                              IREE_HAL_WAIT_FLAG_DEFAULT));
   CheckSemaphoreValue(semaphore22, semaphore2x_signal_value);
 
   iree_hal_semaphore_release(semaphore11);
@@ -653,7 +669,8 @@
   // Wait and check that semaphore3 has advanced.
   IREE_ASSERT_OK(
       iree_hal_semaphore_wait(semaphore3, semaphore_signal_value,
-                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+                              IREE_HAL_WAIT_FLAG_DEFAULT));
   CheckSemaphoreValue(semaphore3, semaphore_signal_value);
 
   signal_thread.join();
@@ -708,17 +725,19 @@
   std::thread thread11([&]() {
     IREE_ASSERT_OK(
         iree_hal_semaphore_wait(semaphore11, signal_value,
-                                iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+                                iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+                                IREE_HAL_WAIT_FLAG_DEFAULT));
   });
   std::thread thread12([&]() {
     IREE_ASSERT_OK(
         iree_hal_semaphore_wait(semaphore12, signal_value,
-                                iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+                                iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+                                IREE_HAL_WAIT_FLAG_DEFAULT));
   });
   std::thread thread2([&]() {
-    IREE_ASSERT_OK(
-        iree_hal_semaphore_wait(semaphore2, signal_value,
-                                iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+    IREE_ASSERT_OK(iree_hal_semaphore_wait(
+        semaphore2, signal_value, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+        IREE_HAL_WAIT_FLAG_DEFAULT));
   });
 
   // Submit command_buffer1.
@@ -788,7 +807,8 @@
 
   IREE_ASSERT_OK(
       iree_hal_semaphore_wait(semaphore2, semaphore2_signal_value,
-                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+                              IREE_HAL_WAIT_FLAG_DEFAULT));
   CheckSemaphoreValue(semaphore2, semaphore2_signal_value);
 
   iree_hal_semaphore_release(semaphore1);
@@ -836,7 +856,8 @@
   // the waiting would have begun.
   IREE_ASSERT_OK(
       iree_hal_semaphore_wait(semaphore2, semaphore2_signal_value,
-                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+                              IREE_HAL_WAIT_FLAG_DEFAULT));
   CheckSemaphoreValue(semaphore2, semaphore2_signal_value);
 
   signal_thread.join();
@@ -881,7 +902,8 @@
 
   iree_status_t wait_status =
       iree_hal_semaphore_wait(semaphore2, semaphore2_signal_value,
-                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE));
+                              iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+                              IREE_HAL_WAIT_FLAG_DEFAULT);
   EXPECT_EQ(iree_status_code(wait_status), IREE_STATUS_ABORTED);
   uint64_t value = 1234;
   iree_status_t query_status = iree_hal_semaphore_query(semaphore2, &value);
diff --git a/runtime/src/iree/hal/cts/semaphore_test.h b/runtime/src/iree/hal/cts/semaphore_test.h
index 7d0592f..1e5e313 100644
--- a/runtime/src/iree/hal/cts/semaphore_test.h
+++ b/runtime/src/iree/hal/cts/semaphore_test.h
@@ -26,8 +26,9 @@
 // Tests that a semaphore that is unused properly cleans itself up.
 TEST_F(SemaphoreTest, NoOp) {
   iree_hal_semaphore_t* semaphore = NULL;
-  IREE_ASSERT_OK(iree_hal_semaphore_create(
-      device_, 123ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 123ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore));
 
   uint64_t value;
   IREE_ASSERT_OK(iree_hal_semaphore_query(semaphore, &value));
@@ -39,8 +40,9 @@
 // Tests that a semaphore will accept new values as it is signaled.
 TEST_F(SemaphoreTest, NormalSignaling) {
   iree_hal_semaphore_t* semaphore = NULL;
-  IREE_ASSERT_OK(iree_hal_semaphore_create(
-      device_, 2ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 2ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore));
 
   uint64_t value;
   IREE_ASSERT_OK(iree_hal_semaphore_query(semaphore, &value));
@@ -62,8 +64,9 @@
 // Tests semaphore failure handling.
 TEST_F(SemaphoreTest, Failure) {
   iree_hal_semaphore_t* semaphore = NULL;
-  IREE_ASSERT_OK(iree_hal_semaphore_create(
-      device_, 2ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 2ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore));
 
   IREE_ASSERT_OK(iree_hal_semaphore_signal(semaphore, 3ull));
   uint64_t value;
@@ -85,35 +88,44 @@
 TEST_F(SemaphoreTest, EmptyWait) {
   IREE_ASSERT_OK(iree_hal_device_wait_semaphores(
       device_, IREE_HAL_WAIT_MODE_ANY, iree_hal_semaphore_list_empty(),
-      iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+      iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
   IREE_ASSERT_OK(iree_hal_device_wait_semaphores(
       device_, IREE_HAL_WAIT_MODE_ALL, iree_hal_semaphore_list_empty(),
-      iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+      iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
 
   IREE_ASSERT_OK(iree_hal_device_wait_semaphores(
       device_, IREE_HAL_WAIT_MODE_ANY, iree_hal_semaphore_list_empty(),
-      iree_make_timeout_ns(IREE_DURATION_INFINITE)));
+      iree_make_timeout_ns(IREE_DURATION_INFINITE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
   IREE_ASSERT_OK(iree_hal_device_wait_semaphores(
       device_, IREE_HAL_WAIT_MODE_ALL, iree_hal_semaphore_list_empty(),
-      iree_make_timeout_ns(IREE_DURATION_INFINITE)));
+      iree_make_timeout_ns(IREE_DURATION_INFINITE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
 }
 
 // Tests waiting on a semaphore that has already been signaled.
 TEST_F(SemaphoreTest, WaitAlreadySignaled) {
   iree_hal_semaphore_t* semaphore = NULL;
-  IREE_ASSERT_OK(iree_hal_semaphore_create(
-      device_, 2ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 2ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore));
 
   // Test both previous and current values.
   IREE_ASSERT_OK(iree_hal_semaphore_wait(
-      semaphore, 1ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+      semaphore, 1ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
   IREE_ASSERT_OK(iree_hal_semaphore_wait(
-      semaphore, 2ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+      semaphore, 2ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
 
   IREE_ASSERT_OK(iree_hal_semaphore_wait(
-      semaphore, 1ull, iree_make_timeout_ns(IREE_DURATION_INFINITE)));
+      semaphore, 1ull, iree_make_timeout_ns(IREE_DURATION_INFINITE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
   IREE_ASSERT_OK(iree_hal_semaphore_wait(
-      semaphore, 2ull, iree_make_timeout_ns(IREE_DURATION_INFINITE)));
+      semaphore, 2ull, iree_make_timeout_ns(IREE_DURATION_INFINITE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
 
   iree_hal_semaphore_release(semaphore);
 }
@@ -121,14 +133,16 @@
 // Tests waiting on a semaphore that has not been signaled.
 TEST_F(SemaphoreTest, WaitUnsignaled) {
   iree_hal_semaphore_t* semaphore = NULL;
-  IREE_ASSERT_OK(iree_hal_semaphore_create(
-      device_, 2ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 2ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore));
 
   // NOTE: we don't actually block here because otherwise we'd lock up.
   // Result status is undefined - some backends may return DeadlineExceededError
   // while others may return success.
   IREE_IGNORE_ERROR(iree_hal_semaphore_wait(
-      semaphore, 3ull, iree_make_deadline(IREE_TIME_INFINITE_PAST)));
+      semaphore, 3ull, iree_make_deadline(IREE_TIME_INFINITE_PAST),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
 
   iree_hal_semaphore_release(semaphore);
 }
@@ -136,8 +150,9 @@
 // Tests waiting on a semaphore that has signals past the desired value.
 TEST_F(SemaphoreTest, WaitLaterSignaledBeyond) {
   iree_hal_semaphore_t* semaphore = NULL;
-  IREE_ASSERT_OK(iree_hal_semaphore_create(
-      device_, 2ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 2ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore));
 
   std::thread thread([&]() {
     // Wait for a short period before signaling.
@@ -147,7 +162,8 @@
   });
 
   IREE_ASSERT_OK(iree_hal_semaphore_wait(
-      semaphore, 3ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+      semaphore, 3ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
   thread.join();
 
   iree_hal_semaphore_release(semaphore);
@@ -160,10 +176,12 @@
 TEST_F(SemaphoreTest, WaitAllButNotAllSignaled) {
   iree_hal_semaphore_t* semaphore_a = NULL;
   iree_hal_semaphore_t* semaphore_b = NULL;
-  IREE_ASSERT_OK(iree_hal_semaphore_create(
-      device_, 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore_a));
-  IREE_ASSERT_OK(iree_hal_semaphore_create(
-      device_, 1ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore_b));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore_a));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 1ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore_b));
 
   iree_hal_semaphore_list_t semaphore_list;
   iree_hal_semaphore_t* semaphore_ptrs[] = {semaphore_a, semaphore_b};
@@ -177,7 +195,7 @@
   // while others may return success.
   IREE_IGNORE_ERROR(iree_hal_device_wait_semaphores(
       device_, IREE_HAL_WAIT_MODE_ALL, semaphore_list,
-      iree_make_deadline(IREE_TIME_INFINITE_PAST)));
+      iree_make_deadline(IREE_TIME_INFINITE_PAST), IREE_HAL_WAIT_FLAG_DEFAULT));
 
   iree_hal_semaphore_release(semaphore_a);
   iree_hal_semaphore_release(semaphore_b);
@@ -187,10 +205,12 @@
 TEST_F(SemaphoreTest, WaitAllAndAllSignaled) {
   iree_hal_semaphore_t* semaphore_a = NULL;
   iree_hal_semaphore_t* semaphore_b = NULL;
-  IREE_ASSERT_OK(iree_hal_semaphore_create(
-      device_, 1ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore_a));
-  IREE_ASSERT_OK(iree_hal_semaphore_create(
-      device_, 1ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore_b));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 1ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore_a));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 1ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore_b));
 
   iree_hal_semaphore_list_t semaphore_list;
   iree_hal_semaphore_t* semaphore_ptrs[] = {semaphore_a, semaphore_b};
@@ -204,7 +224,8 @@
   // while others may return success.
   IREE_IGNORE_ERROR(iree_hal_device_wait_semaphores(
       device_, IREE_HAL_WAIT_MODE_ALL, semaphore_list,
-      iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+      iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
 
   iree_hal_semaphore_release(semaphore_a);
   iree_hal_semaphore_release(semaphore_b);
@@ -214,10 +235,12 @@
 TEST_F(SemaphoreTest, WaitAnyAlreadySignaled) {
   iree_hal_semaphore_t* semaphore_a = NULL;
   iree_hal_semaphore_t* semaphore_b = NULL;
-  IREE_ASSERT_OK(iree_hal_semaphore_create(
-      device_, 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore_a));
-  IREE_ASSERT_OK(iree_hal_semaphore_create(
-      device_, 1ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore_b));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore_a));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 1ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore_b));
 
   iree_hal_semaphore_list_t semaphore_list;
   iree_hal_semaphore_t* semaphore_ptrs[] = {semaphore_a, semaphore_b};
@@ -228,7 +251,8 @@
 
   IREE_ASSERT_OK(iree_hal_device_wait_semaphores(
       device_, IREE_HAL_WAIT_MODE_ANY, semaphore_list,
-      iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+      iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
 
   iree_hal_semaphore_release(semaphore_a);
   iree_hal_semaphore_release(semaphore_b);
@@ -237,10 +261,12 @@
 TEST_F(SemaphoreTest, WaitAnyLaterSignaled) {
   iree_hal_semaphore_t* semaphore_a = NULL;
   iree_hal_semaphore_t* semaphore_b = NULL;
-  IREE_ASSERT_OK(iree_hal_semaphore_create(
-      device_, 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore_a));
-  IREE_ASSERT_OK(iree_hal_semaphore_create(
-      device_, 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore_b));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore_a));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore_b));
 
   iree_hal_semaphore_list_t semaphore_list;
   iree_hal_semaphore_t* semaphore_ptrs[] = {semaphore_a, semaphore_b};
@@ -257,7 +283,8 @@
 
   IREE_ASSERT_OK(iree_hal_device_wait_semaphores(
       device_, IREE_HAL_WAIT_MODE_ANY, semaphore_list,
-      iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+      iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
   thread.join();
 
   iree_hal_semaphore_release(semaphore_a);
@@ -269,22 +296,27 @@
 TEST_F(SemaphoreTest, PingPong) {
   iree_hal_semaphore_t* a2b = NULL;
   iree_hal_semaphore_t* b2a = NULL;
-  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 0ull,
-                                           IREE_HAL_SEMAPHORE_FLAG_NONE, &a2b));
-  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 0ull,
-                                           IREE_HAL_SEMAPHORE_FLAG_NONE, &b2a));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &a2b));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_create(device_, IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &b2a));
   std::thread thread([&]() {
     // Should advance right past this because the value is already set.
     IREE_ASSERT_OK(iree_hal_semaphore_wait(
-        a2b, 0ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+        a2b, 0ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+        IREE_HAL_WAIT_FLAG_DEFAULT));
     IREE_ASSERT_OK(iree_hal_semaphore_signal(b2a, 1ull));
     // Jump ahead (blocking at first).
     IREE_ASSERT_OK(iree_hal_semaphore_wait(
-        a2b, 4ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+        a2b, 4ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+        IREE_HAL_WAIT_FLAG_DEFAULT));
   });
   // Block until thread signals.
   IREE_ASSERT_OK(iree_hal_semaphore_wait(
-      b2a, 1ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+      b2a, 1ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
   IREE_ASSERT_OK(iree_hal_semaphore_signal(a2b, 4ull));
   thread.join();
 
@@ -299,11 +331,13 @@
       [&]() { IREE_ASSERT_OK(iree_hal_semaphore_signal(semaphore, 1)); });
 
   IREE_ASSERT_OK(iree_hal_semaphore_wait(
-      semaphore, 1, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+      semaphore, 1, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
   CheckSemaphoreValue(semaphore, 1);
 
   IREE_ASSERT_OK(iree_hal_semaphore_wait(
-      semaphore, 1, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+      semaphore, 1, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT));
   CheckSemaphoreValue(semaphore, 1);
 
   thread.join();
@@ -340,18 +374,21 @@
 
   // Immediate timeout.
   generic_test_fn([](iree_hal_semaphore_t* semaphore) {
-    return iree_hal_semaphore_wait(semaphore, 1, iree_immediate_timeout());
+    return iree_hal_semaphore_wait(semaphore, 1, iree_immediate_timeout(),
+                                   IREE_HAL_WAIT_FLAG_DEFAULT);
   });
 
   // Absolute timeout.
   generic_test_fn([](iree_hal_semaphore_t* semaphore) {
     return iree_hal_semaphore_wait(semaphore, 1,
-                                   iree_make_deadline(iree_time_now() + 1));
+                                   iree_make_deadline(iree_time_now() + 1),
+                                   IREE_HAL_WAIT_FLAG_DEFAULT);
   });
 
   // Relative timeout.
   generic_test_fn([](iree_hal_semaphore_t* semaphore) {
-    return iree_hal_semaphore_wait(semaphore, 1, iree_make_timeout_ns(1));
+    return iree_hal_semaphore_wait(semaphore, 1, iree_make_timeout_ns(1),
+                                   IREE_HAL_WAIT_FLAG_DEFAULT);
   });
 }
 
@@ -370,12 +407,14 @@
 
   std::thread wait_thread1([&]() {
     IREE_ASSERT_OK(iree_hal_semaphore_list_wait(
-        semaphore_list, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+        semaphore_list, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+        IREE_HAL_WAIT_FLAG_DEFAULT));
   });
 
   std::thread wait_thread2([&]() {
     IREE_ASSERT_OK(iree_hal_semaphore_list_wait(
-        semaphore_list, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+        semaphore_list, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+        IREE_HAL_WAIT_FLAG_DEFAULT));
   });
 
   std::thread signal_thread([&]() {
@@ -402,7 +441,8 @@
   iree_hal_semaphore_fail(semaphore, iree_status_clone(status));
 
   iree_status_t wait_status = iree_hal_semaphore_wait(
-      semaphore, 1, iree_make_deadline(IREE_TIME_INFINITE_FUTURE));
+      semaphore, 1, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT);
   EXPECT_EQ(iree_status_code(wait_status), IREE_STATUS_ABORTED);
   uint64_t value = 1234;
   iree_status_t query_status = iree_hal_semaphore_query(semaphore, &value);
@@ -427,7 +467,8 @@
       [&]() { iree_hal_semaphore_fail(semaphore, iree_status_clone(status)); });
 
   iree_status_t wait_status = iree_hal_semaphore_wait(
-      semaphore, 1, iree_make_deadline(IREE_TIME_INFINITE_FUTURE));
+      semaphore, 1, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT);
   EXPECT_EQ(iree_status_code(wait_status), IREE_STATUS_ABORTED);
   uint64_t value = 1234;
   iree_status_t query_status = iree_hal_semaphore_query(semaphore, &value);
@@ -462,7 +503,8 @@
       payload_array,
   };
   iree_status_t wait_status = iree_hal_semaphore_list_wait(
-      semaphore_list, iree_make_deadline(IREE_TIME_INFINITE_FUTURE));
+      semaphore_list, iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT);
   EXPECT_EQ(iree_status_code(wait_status), IREE_STATUS_ABORTED);
   uint64_t value = 1234;
   iree_status_t semaphore1_query_status =
@@ -506,7 +548,8 @@
   };
   iree_status_t wait_status = iree_hal_device_wait_semaphores(
       device_, IREE_HAL_WAIT_MODE_ANY, semaphore_list,
-      iree_make_deadline(IREE_TIME_INFINITE_FUTURE));
+      iree_make_deadline(IREE_TIME_INFINITE_FUTURE),
+      IREE_HAL_WAIT_FLAG_DEFAULT);
   EXPECT_EQ(iree_status_code(wait_status), IREE_STATUS_ABORTED);
   uint64_t value = 1234;
   iree_status_t semaphore1_query_status =
diff --git a/runtime/src/iree/hal/device.c b/runtime/src/iree/hal/device.c
index 6e535ca..ad160a9 100644
--- a/runtime/src/iree/hal/device.c
+++ b/runtime/src/iree/hal/device.c
@@ -584,12 +584,13 @@
 
 IREE_API_EXPORT iree_status_t iree_hal_device_wait_semaphores(
     iree_hal_device_t* device, iree_hal_wait_mode_t wait_mode,
-    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) {
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags) {
   IREE_ASSERT_ARGUMENT(device);
   if (semaphore_list.count == 0) return iree_ok_status();
   IREE_TRACE_ZONE_BEGIN(z0);
   iree_status_t status = _VTABLE_DISPATCH(device, wait_semaphores)(
-      device, wait_mode, semaphore_list, timeout);
+      device, wait_mode, semaphore_list, timeout, flags);
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
diff --git a/runtime/src/iree/hal/device.h b/runtime/src/iree/hal/device.h
index e4fe7d6..2021745 100644
--- a/runtime/src/iree/hal/device.h
+++ b/runtime/src/iree/hal/device.h
@@ -472,7 +472,8 @@
 // failed and get the status.
 IREE_API_EXPORT iree_status_t iree_hal_device_wait_semaphores(
     iree_hal_device_t* device, iree_hal_wait_mode_t wait_mode,
-    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout);
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags);
 
 // Begins a profile capture on |device| with the given |options|.
 // This will use an implementation-defined profiling API to capture all
@@ -590,8 +591,9 @@
       iree_hal_external_file_flags_t flags, iree_hal_file_t** out_file);
 
   iree_status_t(IREE_API_PTR* create_semaphore)(
-      iree_hal_device_t* device, uint64_t initial_value,
-      iree_hal_semaphore_flags_t flags, iree_hal_semaphore_t** out_semaphore);
+      iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+      uint64_t initial_value, iree_hal_semaphore_flags_t flags,
+      iree_hal_semaphore_t** out_semaphore);
 
   iree_hal_semaphore_compatibility_t(
       IREE_API_PTR* query_semaphore_compatibility)(
@@ -673,7 +675,8 @@
 
   iree_status_t(IREE_API_PTR* wait_semaphores)(
       iree_hal_device_t* device, iree_hal_wait_mode_t wait_mode,
-      const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout);
+      const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+      iree_hal_wait_flags_t flags);
 
   iree_status_t(IREE_API_PTR* profiling_begin)(
       iree_hal_device_t* device,
diff --git a/runtime/src/iree/hal/drivers/amdgpu/logical_device.c b/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
index 3a3fe32..5a29e02 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
@@ -371,7 +371,7 @@
     status = iree_hal_amdgpu_semaphore_pool_initialize(
         &system->libhsa, &system->topology,
         IREE_HAL_AMDGPU_SEMAPHORE_POOL_DEFAULT_BLOCK_CAPACITY,
-        semaphore_options, IREE_HAL_SEMAPHORE_FLAG_NONE, host_allocator,
+        semaphore_options, IREE_HAL_SEMAPHORE_FLAG_DEFAULT, host_allocator,
         system->host_memory_pools[0].fine_pool,
         &logical_device->semaphore_pool);
   }
@@ -747,8 +747,9 @@
 }
 
 static iree_status_t iree_hal_amdgpu_logical_device_create_semaphore(
-    iree_hal_device_t* base_device, uint64_t initial_value,
-    iree_hal_semaphore_flags_t flags, iree_hal_semaphore_t** out_semaphore) {
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    uint64_t initial_value, iree_hal_semaphore_flags_t flags,
+    iree_hal_semaphore_t** out_semaphore) {
   iree_hal_amdgpu_logical_device_t* logical_device =
       iree_hal_amdgpu_logical_device_cast(base_device);
 
@@ -1034,12 +1035,13 @@
 
 static iree_status_t iree_hal_amdgpu_logical_device_wait_semaphores(
     iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
-    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) {
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags) {
   iree_hal_amdgpu_logical_device_t* logical_device =
       iree_hal_amdgpu_logical_device_cast(base_device);
-  return iree_hal_amdgpu_wait_semaphores(&logical_device->system->libhsa,
-                                         logical_device->semaphore_pool.options,
-                                         wait_mode, semaphore_list, timeout);
+  return iree_hal_amdgpu_wait_semaphores(
+      &logical_device->system->libhsa, logical_device->semaphore_pool.options,
+      wait_mode, semaphore_list, timeout, flags);
 }
 
 static iree_status_t iree_hal_amdgpu_logical_device_profiling_begin(
diff --git a/runtime/src/iree/hal/drivers/amdgpu/semaphore.c b/runtime/src/iree/hal/drivers/amdgpu/semaphore.c
index 58a2496..c7e94ea 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/semaphore.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/semaphore.c
@@ -224,7 +224,7 @@
 
 static iree_status_t iree_hal_amdgpu_internal_semaphore_wait(
     iree_hal_semaphore_t* base_semaphore, uint64_t value,
-    iree_timeout_t timeout) {
+    iree_timeout_t timeout, iree_hal_wait_flags_t flags) {
   iree_hal_amdgpu_internal_semaphore_t* semaphore =
       iree_hal_amdgpu_internal_semaphore_cast(base_semaphore);
   iree_hal_semaphore_list_t semaphore_list = {
@@ -234,7 +234,7 @@
   };
   return iree_hal_amdgpu_wait_semaphores(semaphore->libhsa, semaphore->options,
                                          IREE_HAL_WAIT_MODE_ALL, semaphore_list,
-                                         timeout);
+                                         timeout, flags);
 }
 
 static const iree_hal_semaphore_vtable_t
@@ -345,7 +345,8 @@
 iree_status_t iree_hal_amdgpu_wait_semaphores(
     const iree_hal_amdgpu_libhsa_t* libhsa,
     iree_hal_amdgpu_semaphore_options_t options, iree_hal_wait_mode_t wait_mode,
-    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) {
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags) {
   IREE_ASSERT_ARGUMENT(libhsa);
   if (semaphore_list.count == 0) return iree_ok_status();  // no-op
   IREE_TRACE_ZONE_BEGIN(z0);
diff --git a/runtime/src/iree/hal/drivers/amdgpu/semaphore.h b/runtime/src/iree/hal/drivers/amdgpu/semaphore.h
index ea52561..e7c59cf 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/semaphore.h
+++ b/runtime/src/iree/hal/drivers/amdgpu/semaphore.h
@@ -145,7 +145,8 @@
 iree_status_t iree_hal_amdgpu_wait_semaphores(
     const iree_hal_amdgpu_libhsa_t* libhsa,
     iree_hal_amdgpu_semaphore_options_t options, iree_hal_wait_mode_t wait_mode,
-    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout);
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/runtime/src/iree/hal/drivers/amdgpu/semaphore_pool_test.cc b/runtime/src/iree/hal/drivers/amdgpu/semaphore_pool_test.cc
index b621a31..5408494 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/semaphore_pool_test.cc
+++ b/runtime/src/iree/hal/drivers/amdgpu/semaphore_pool_test.cc
@@ -70,7 +70,7 @@
   iree_hal_amdgpu_semaphore_pool_t semaphore_pool = {0};
   IREE_ASSERT_OK(iree_hal_amdgpu_semaphore_pool_initialize(
       &libhsa, &topology, IREE_HAL_AMDGPU_SEMAPHORE_POOL_DEFAULT_BLOCK_CAPACITY,
-      options, IREE_HAL_SEMAPHORE_FLAG_NONE, host_allocator, cpu_memory_pool,
+      options, IREE_HAL_SEMAPHORE_FLAG_DEFAULT, host_allocator, cpu_memory_pool,
       &semaphore_pool));
 
   // No-op since nothing has been allocated.
@@ -90,7 +90,7 @@
   iree_hal_amdgpu_semaphore_pool_t semaphore_pool = {0};
   IREE_ASSERT_OK(iree_hal_amdgpu_semaphore_pool_initialize(
       &libhsa, &topology,
-      /*block_capacity=*/32, options, IREE_HAL_SEMAPHORE_FLAG_NONE,
+      /*block_capacity=*/32, options, IREE_HAL_SEMAPHORE_FLAG_DEFAULT,
       host_allocator, cpu_memory_pool, &semaphore_pool));
 
   // No-op since nothing has been allocated yet.
@@ -127,14 +127,14 @@
   iree_hal_amdgpu_semaphore_pool_t semaphore_pool = {0};
   IREE_ASSERT_OK(iree_hal_amdgpu_semaphore_pool_initialize(
       &libhsa, &topology,
-      /*block_capacity=*/32, options, IREE_HAL_SEMAPHORE_FLAG_NONE,
+      /*block_capacity=*/32, options, IREE_HAL_SEMAPHORE_FLAG_DEFAULT,
       host_allocator, cpu_memory_pool, &semaphore_pool));
 
   // Acquire a semaphore.
   const uint64_t initial_value = 123ull;
   iree_hal_semaphore_t* semaphore = NULL;
   IREE_ASSERT_OK(iree_hal_amdgpu_semaphore_pool_acquire(
-      &semaphore_pool, initial_value, IREE_HAL_SEMAPHORE_FLAG_NONE,
+      &semaphore_pool, initial_value, IREE_HAL_SEMAPHORE_FLAG_DEFAULT,
       &semaphore));
   ASSERT_NE(semaphore, nullptr);
 
@@ -167,7 +167,7 @@
   iree_hal_amdgpu_semaphore_pool_t semaphore_pool = {0};
   IREE_ASSERT_OK(iree_hal_amdgpu_semaphore_pool_initialize(
       &libhsa, &topology, /*block_capacity=*/32, options,
-      IREE_HAL_SEMAPHORE_FLAG_NONE, host_allocator, cpu_memory_pool,
+      IREE_HAL_SEMAPHORE_FLAG_DEFAULT, host_allocator, cpu_memory_pool,
       &semaphore_pool));
   // NOTE: the capacity may be larger than requested due to alignment.
   const iree_host_size_t block_capacity = semaphore_pool.block_capacity;
@@ -181,15 +181,15 @@
   // Allocate enough to consume the entire first block.
   for (iree_host_size_t i = 0; i < block_capacity; ++i) {
     IREE_ASSERT_OK(iree_hal_amdgpu_semaphore_pool_acquire(
-        &semaphore_pool, /*initial_value=*/0ull, IREE_HAL_SEMAPHORE_FLAG_NONE,
-        &semaphores[i]));
+        &semaphore_pool, /*initial_value=*/0ull,
+        IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphores[i]));
     ASSERT_NE(semaphores[i], nullptr);
   }
 
   // Allocate +1 to trigger growth and acquire the next block.
   iree_hal_semaphore_t* growth_semaphore = NULL;
   IREE_ASSERT_OK(iree_hal_amdgpu_semaphore_pool_acquire(
-      &semaphore_pool, /*initial_value=*/0ull, IREE_HAL_SEMAPHORE_FLAG_NONE,
+      &semaphore_pool, /*initial_value=*/0ull, IREE_HAL_SEMAPHORE_FLAG_DEFAULT,
       &growth_semaphore));
   ASSERT_NE(growth_semaphore, nullptr);
 
diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_device.c b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
index 9e5ae7b..4459030 100644
--- a/runtime/src/iree/hal/drivers/cuda/cuda_device.c
+++ b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
@@ -902,8 +902,9 @@
 }
 
 static iree_status_t iree_hal_cuda_device_create_semaphore(
-    iree_hal_device_t* base_device, uint64_t initial_value,
-    iree_hal_semaphore_flags_t flags, iree_hal_semaphore_t** out_semaphore) {
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    uint64_t initial_value, iree_hal_semaphore_flags_t flags,
+    iree_hal_semaphore_t** out_semaphore) {
   iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
   return iree_hal_cuda_event_semaphore_create(
       initial_value, device->cuda_symbols, device->timepoint_pool,
@@ -933,8 +934,9 @@
   // NOTE: block on the semaphores here; we could avoid this by properly
   // sequencing device work with semaphores. The CUDA HAL is not currently
   // asynchronous.
-  IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list,
-                                                    iree_infinite_timeout()));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_semaphore_list_wait(wait_semaphore_list, iree_infinite_timeout(),
+                                   IREE_HAL_WAIT_FLAG_DEFAULT));
 
   // Allocate from the pool; likely to fail in cases of virtual memory
   // exhaustion but the error may be deferred until a later synchronization.
@@ -975,8 +977,9 @@
   // NOTE: block on the semaphores here; we could avoid this by properly
   // sequencing device work with semaphores. The CUDA HAL is not currently
   // asynchronous.
-  IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list,
-                                                    iree_infinite_timeout()));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_semaphore_list_wait(wait_semaphore_list, iree_infinite_timeout(),
+                                   IREE_HAL_WAIT_FLAG_DEFAULT));
 
   // Schedule the buffer deallocation if we got it from a pool and otherwise
   // drop it on the floor and let it be freed when the buffer is released.
@@ -1078,10 +1081,11 @@
 
 static iree_status_t iree_hal_cuda_device_wait_semaphores(
     iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
-    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) {
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags) {
   iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
   return iree_hal_cuda_semaphore_multi_wait(semaphore_list, wait_mode, timeout,
-                                            &device->block_pool);
+                                            flags, &device->block_pool);
 }
 
 static iree_status_t iree_hal_cuda_device_profiling_begin(
diff --git a/runtime/src/iree/hal/drivers/cuda/event_semaphore.c b/runtime/src/iree/hal/drivers/cuda/event_semaphore.c
index e5899a2..e17c69c 100644
--- a/runtime/src/iree/hal/drivers/cuda/event_semaphore.c
+++ b/runtime/src/iree/hal/drivers/cuda/event_semaphore.c
@@ -308,7 +308,7 @@
 
 static iree_status_t iree_hal_cuda_semaphore_wait(
     iree_hal_semaphore_t* base_semaphore, uint64_t value,
-    iree_timeout_t timeout) {
+    iree_timeout_t timeout, iree_hal_wait_flags_t flags) {
   iree_hal_cuda_semaphore_t* semaphore =
       iree_hal_cuda_semaphore_cast(base_semaphore);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -362,13 +362,14 @@
 iree_status_t iree_hal_cuda_semaphore_multi_wait(
     const iree_hal_semaphore_list_t semaphore_list,
     iree_hal_wait_mode_t wait_mode, iree_timeout_t timeout,
-    iree_arena_block_pool_t* block_pool) {
+    iree_hal_wait_flags_t flags, iree_arena_block_pool_t* block_pool) {
   if (semaphore_list.count == 0) return iree_ok_status();
 
   if (semaphore_list.count == 1) {
     // Fast-path for a single semaphore.
     return iree_hal_semaphore_wait(semaphore_list.semaphores[0],
-                                   semaphore_list.payload_values[0], timeout);
+                                   semaphore_list.payload_values[0], timeout,
+                                   flags);
   }
 
   IREE_TRACE_ZONE_BEGIN(z0);
diff --git a/runtime/src/iree/hal/drivers/cuda/event_semaphore.h b/runtime/src/iree/hal/drivers/cuda/event_semaphore.h
index e67d55f..9b10d0b 100644
--- a/runtime/src/iree/hal/drivers/cuda/event_semaphore.h
+++ b/runtime/src/iree/hal/drivers/cuda/event_semaphore.h
@@ -56,7 +56,7 @@
 iree_status_t iree_hal_cuda_semaphore_multi_wait(
     const iree_hal_semaphore_list_t semaphore_list,
     iree_hal_wait_mode_t wait_mode, iree_timeout_t timeout,
-    iree_arena_block_pool_t* block_pool);
+    iree_hal_wait_flags_t flags, iree_arena_block_pool_t* block_pool);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/runtime/src/iree/hal/drivers/hip/event_semaphore.c b/runtime/src/iree/hal/drivers/hip/event_semaphore.c
index a2c2028..277c9e6 100644
--- a/runtime/src/iree/hal/drivers/hip/event_semaphore.c
+++ b/runtime/src/iree/hal/drivers/hip/event_semaphore.c
@@ -282,7 +282,7 @@
 iree_status_t iree_hal_hip_semaphore_multi_wait(
     const iree_hal_semaphore_list_t semaphore_list,
     iree_hal_wait_mode_t wait_mode, iree_timeout_t timeout,
-    iree_allocator_t host_allocator) {
+    iree_hal_wait_flags_t flags, iree_allocator_t host_allocator) {
   if (semaphore_list.count == 0) return iree_ok_status();
   IREE_TRACE_ZONE_BEGIN(z0);
 
@@ -296,8 +296,9 @@
     for (iree_host_size_t i = 0; i < semaphore_list.count; ++i) {
       iree_timeout_t t = iree_make_deadline(deadline_ns);
       status = iree_status_join(
-          status, iree_hal_semaphore_wait(semaphore_list.semaphores[0],
-                                          semaphore_list.payload_values[0], t));
+          status,
+          iree_hal_semaphore_wait(semaphore_list.semaphores[i],
+                                  semaphore_list.payload_values[i], t, flags));
       if (!iree_status_is_ok(status)) {
         break;
       }
@@ -941,7 +942,7 @@
 
 static iree_status_t iree_hal_hip_semaphore_wait(
     iree_hal_semaphore_t* base_semaphore, uint64_t value,
-    iree_timeout_t timeout) {
+    iree_timeout_t timeout, iree_hal_wait_flags_t flags) {
   iree_hal_hip_semaphore_t* semaphore =
       iree_hal_hip_semaphore_cast(base_semaphore);
   IREE_TRACE_ZONE_BEGIN(z0);
diff --git a/runtime/src/iree/hal/drivers/hip/event_semaphore.h b/runtime/src/iree/hal/drivers/hip/event_semaphore.h
index b5dec88..d2accae 100644
--- a/runtime/src/iree/hal/drivers/hip/event_semaphore.h
+++ b/runtime/src/iree/hal/drivers/hip/event_semaphore.h
@@ -37,7 +37,7 @@
 iree_status_t iree_hal_hip_semaphore_multi_wait(
     const iree_hal_semaphore_list_t semaphore_list,
     iree_hal_wait_mode_t wait_mode, iree_timeout_t timeout,
-    iree_allocator_t host_allocator);
+    iree_hal_wait_flags_t flags, iree_allocator_t host_allocator);
 
 // Adds a work item to be executed once we have a forward progress
 // guarantee on this semaphore to reach a particular value.
diff --git a/runtime/src/iree/hal/drivers/hip/hip_device.c b/runtime/src/iree/hal/drivers/hip/hip_device.c
index 3f8b259..4c26a28 100644
--- a/runtime/src/iree/hal/drivers/hip/hip_device.c
+++ b/runtime/src/iree/hal/drivers/hip/hip_device.c
@@ -986,8 +986,9 @@
 }
 
 static iree_status_t iree_hal_hip_device_create_semaphore(
-    iree_hal_device_t* base_device, uint64_t initial_value,
-    iree_hal_semaphore_flags_t flags, iree_hal_semaphore_t** out_semaphore) {
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    uint64_t initial_value, iree_hal_semaphore_flags_t flags,
+    iree_hal_semaphore_t** out_semaphore) {
   iree_hal_hip_device_t* device = iree_hal_hip_device_cast(base_device);
   return iree_hal_hip_event_semaphore_create(
       initial_value, device->hip_symbols, device->host_allocator,
@@ -1699,8 +1700,9 @@
   // sequencing device work with semaphores. The HIP HAL is not currently
   // asynchronous.
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_hal_semaphore_list_wait(wait_semaphore_list,
-                                       iree_infinite_timeout()));
+      z0,
+      iree_hal_semaphore_list_wait(wait_semaphore_list, iree_infinite_timeout(),
+                                   IREE_HAL_WAIT_FLAG_DEFAULT));
 
   status =
       iree_hal_allocator_allocate_buffer(iree_hal_device_allocator(base_device),
@@ -1800,8 +1802,9 @@
   // sequencing device work with semaphores. The HIP HAL is not currently
   // asynchronous.
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_hal_semaphore_list_wait(wait_semaphore_list,
-                                       iree_infinite_timeout()));
+      z0,
+      iree_hal_semaphore_list_wait(wait_semaphore_list, iree_infinite_timeout(),
+                                   IREE_HAL_WAIT_FLAG_DEFAULT));
 
   // Schedule the buffer deallocation if we got it from a pool and otherwise
   // drop it on the floor and let it be freed when the buffer is released.
@@ -2648,10 +2651,11 @@
 
 static iree_status_t iree_hal_hip_device_wait_semaphores(
     iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
-    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) {
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags) {
   iree_hal_hip_device_t* device = iree_hal_hip_device_cast(base_device);
   return iree_hal_hip_semaphore_multi_wait(semaphore_list, wait_mode, timeout,
-                                           device->host_allocator);
+                                           flags, device->host_allocator);
 }
 
 static iree_status_t iree_hal_hip_device_profiling_begin(
diff --git a/runtime/src/iree/hal/drivers/local_sync/sync_device.c b/runtime/src/iree/hal/drivers/local_sync/sync_device.c
index ed9924a..7f4f211 100644
--- a/runtime/src/iree/hal/drivers/local_sync/sync_device.c
+++ b/runtime/src/iree/hal/drivers/local_sync/sync_device.c
@@ -274,8 +274,9 @@
 }
 
 static iree_status_t iree_hal_sync_device_create_semaphore(
-    iree_hal_device_t* base_device, uint64_t initial_value,
-    iree_hal_semaphore_flags_t flags, iree_hal_semaphore_t** out_semaphore) {
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    uint64_t initial_value, iree_hal_semaphore_flags_t flags,
+    iree_hal_semaphore_t** out_semaphore) {
   iree_hal_sync_device_t* device = iree_hal_sync_device_cast(base_device);
   return iree_hal_sync_semaphore_create(&device->semaphore_state, initial_value,
                                         device->host_allocator, out_semaphore);
@@ -296,8 +297,9 @@
     iree_device_size_t allocation_size, iree_hal_alloca_flags_t flags,
     iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
   // TODO(benvanik): queue-ordered allocations.
-  IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list,
-                                                    iree_infinite_timeout()));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_semaphore_list_wait(wait_semaphore_list, iree_infinite_timeout(),
+                                   IREE_HAL_WAIT_FLAG_DEFAULT));
   IREE_RETURN_IF_ERROR(
       iree_hal_allocator_allocate_buffer(iree_hal_device_allocator(base_device),
                                          params, allocation_size, out_buffer));
@@ -427,7 +429,7 @@
   // Wait for semaphores to be signaled before performing any work.
   IREE_RETURN_IF_ERROR(iree_hal_sync_semaphore_multi_wait(
       &device->semaphore_state, IREE_HAL_WAIT_MODE_ALL, wait_semaphore_list,
-      iree_infinite_timeout()));
+      iree_infinite_timeout(), IREE_HAL_WAIT_FLAG_DEFAULT));
 
   // Run all deferred command buffers - any we could have run inline we already
   // did during recording.
@@ -449,10 +451,11 @@
 
 static iree_status_t iree_hal_sync_device_wait_semaphores(
     iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
-    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) {
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags) {
   iree_hal_sync_device_t* device = iree_hal_sync_device_cast(base_device);
   return iree_hal_sync_semaphore_multi_wait(&device->semaphore_state, wait_mode,
-                                            semaphore_list, timeout);
+                                            semaphore_list, timeout, flags);
 }
 
 static iree_status_t iree_hal_sync_device_profiling_begin(
diff --git a/runtime/src/iree/hal/drivers/local_sync/sync_semaphore.c b/runtime/src/iree/hal/drivers/local_sync/sync_semaphore.c
index 44d0e67..dacd4d5 100644
--- a/runtime/src/iree/hal/drivers/local_sync/sync_semaphore.c
+++ b/runtime/src/iree/hal/drivers/local_sync/sync_semaphore.c
@@ -262,7 +262,7 @@
 
 static iree_status_t iree_hal_sync_semaphore_wait(
     iree_hal_semaphore_t* base_semaphore, uint64_t value,
-    iree_timeout_t timeout) {
+    iree_timeout_t timeout, iree_hal_wait_flags_t flags) {
   iree_hal_sync_semaphore_t* semaphore =
       iree_hal_sync_semaphore_cast(base_semaphore);
 
@@ -395,13 +395,15 @@
 iree_status_t iree_hal_sync_semaphore_multi_wait(
     iree_hal_sync_semaphore_state_t* shared_state,
     iree_hal_wait_mode_t wait_mode,
-    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) {
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags) {
   if (semaphore_list.count == 0) {
     return iree_ok_status();
   } else if (semaphore_list.count == 1) {
     // Fast-path for a single semaphore.
     return iree_hal_semaphore_wait(semaphore_list.semaphores[0],
-                                   semaphore_list.payload_values[0], timeout);
+                                   semaphore_list.payload_values[0], timeout,
+                                   flags);
   }
 
   IREE_TRACE_ZONE_BEGIN(z0);
diff --git a/runtime/src/iree/hal/drivers/local_sync/sync_semaphore.h b/runtime/src/iree/hal/drivers/local_sync/sync_semaphore.h
index 6d8db35..1286570 100644
--- a/runtime/src/iree/hal/drivers/local_sync/sync_semaphore.h
+++ b/runtime/src/iree/hal/drivers/local_sync/sync_semaphore.h
@@ -65,7 +65,8 @@
 iree_status_t iree_hal_sync_semaphore_multi_wait(
     iree_hal_sync_semaphore_state_t* shared_state,
     iree_hal_wait_mode_t wait_mode,
-    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout);
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/runtime/src/iree/hal/drivers/local_task/task_device.c b/runtime/src/iree/hal/drivers/local_task/task_device.c
index 577a7d2..45e4c8e 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_device.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_device.c
@@ -352,8 +352,9 @@
 }
 
 static iree_status_t iree_hal_task_device_create_semaphore(
-    iree_hal_device_t* base_device, uint64_t initial_value,
-    iree_hal_semaphore_flags_t flags, iree_hal_semaphore_t** out_semaphore) {
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    uint64_t initial_value, iree_hal_semaphore_flags_t flags,
+    iree_hal_semaphore_t** out_semaphore) {
   iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
   return iree_hal_task_semaphore_create(
       iree_hal_task_device_shared_event_pool(device), initial_value,
@@ -383,8 +384,9 @@
     iree_device_size_t allocation_size, iree_hal_alloca_flags_t flags,
     iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
   // TODO(benvanik): queue-ordered allocations.
-  IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list,
-                                                    iree_infinite_timeout()));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_semaphore_list_wait(wait_semaphore_list, iree_infinite_timeout(),
+                                   IREE_HAL_WAIT_FLAG_DEFAULT));
   IREE_RETURN_IF_ERROR(
       iree_hal_allocator_allocate_buffer(iree_hal_device_allocator(base_device),
                                          params, allocation_size, out_buffer));
@@ -481,10 +483,11 @@
 
 static iree_status_t iree_hal_task_device_wait_semaphores(
     iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
-    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) {
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags) {
   iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
   return iree_hal_task_semaphore_multi_wait(
-      wait_mode, semaphore_list, timeout,
+      wait_mode, semaphore_list, timeout, flags,
       iree_hal_task_device_shared_event_pool(device),
       &device->large_block_pool);
 }
diff --git a/runtime/src/iree/hal/drivers/local_task/task_semaphore.c b/runtime/src/iree/hal/drivers/local_task/task_semaphore.c
index 73e7a42..62cd17d 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_semaphore.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_semaphore.c
@@ -279,7 +279,7 @@
 
 static iree_status_t iree_hal_task_semaphore_wait(
     iree_hal_semaphore_t* base_semaphore, uint64_t value,
-    iree_timeout_t timeout) {
+    iree_timeout_t timeout, iree_hal_wait_flags_t flags) {
   iree_hal_task_semaphore_t* semaphore =
       iree_hal_task_semaphore_cast(base_semaphore);
 
@@ -324,13 +324,15 @@
 iree_status_t iree_hal_task_semaphore_multi_wait(
     iree_hal_wait_mode_t wait_mode,
     const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
-    iree_event_pool_t* event_pool, iree_arena_block_pool_t* block_pool) {
+    iree_hal_wait_flags_t flags, iree_event_pool_t* event_pool,
+    iree_arena_block_pool_t* block_pool) {
   if (semaphore_list.count == 0) {
     return iree_ok_status();
   } else if (semaphore_list.count == 1) {
     // Fast-path for a single semaphore.
     return iree_hal_semaphore_wait(semaphore_list.semaphores[0],
-                                   semaphore_list.payload_values[0], timeout);
+                                   semaphore_list.payload_values[0], timeout,
+                                   flags);
   }
 
   IREE_TRACE_ZONE_BEGIN(z0);
diff --git a/runtime/src/iree/hal/drivers/local_task/task_semaphore.h b/runtime/src/iree/hal/drivers/local_task/task_semaphore.h
index 79fc9fc..70167c3 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_semaphore.h
+++ b/runtime/src/iree/hal/drivers/local_task/task_semaphore.h
@@ -45,7 +45,8 @@
 iree_status_t iree_hal_task_semaphore_multi_wait(
     iree_hal_wait_mode_t wait_mode,
     const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
-    iree_event_pool_t* event_pool, iree_arena_block_pool_t* block_pool);
+    iree_hal_wait_flags_t flags, iree_event_pool_t* event_pool,
+    iree_arena_block_pool_t* block_pool);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/runtime/src/iree/hal/drivers/metal/metal_device.m b/runtime/src/iree/hal/drivers/metal/metal_device.m
index b621ac1..33028dd 100644
--- a/runtime/src/iree/hal/drivers/metal/metal_device.m
+++ b/runtime/src/iree/hal/drivers/metal/metal_device.m
@@ -292,10 +292,10 @@
                                    handle, iree_hal_device_host_allocator(base_device), out_file);
 }
 
-static iree_status_t iree_hal_metal_device_create_semaphore(iree_hal_device_t* base_device,
-                                                            uint64_t initial_value,
-                                                            iree_hal_semaphore_flags_t flags,
-                                                            iree_hal_semaphore_t** out_semaphore) {
+static iree_status_t iree_hal_metal_device_create_semaphore(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    uint64_t initial_value, iree_hal_semaphore_flags_t flags,
+    iree_hal_semaphore_t** out_semaphore) {
   iree_hal_metal_device_t* device = iree_hal_metal_device_cast(base_device);
   return iree_hal_metal_shared_event_create(device->device, initial_value, device->event_listener,
                                             device->host_allocator, out_semaphore);
@@ -321,7 +321,8 @@
     iree_hal_buffer_params_t params, iree_device_size_t allocation_size,
     iree_hal_alloca_flags_t flags, iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
   // TODO(benvanik): queue-ordered allocations.
-  IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list, iree_infinite_timeout()));
+  IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list, iree_infinite_timeout(),
+                                                    IREE_HAL_WAIT_FLAG_DEFAULT));
   IREE_RETURN_IF_ERROR(iree_hal_allocator_allocate_buffer(iree_hal_device_allocator(base_device),
                                                           params, allocation_size, out_buffer));
   IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_signal(signal_semaphore_list));
@@ -525,8 +526,9 @@
 
 static iree_status_t iree_hal_metal_device_wait_semaphores(
     iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
-    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) {
-  return iree_hal_metal_shared_event_multi_wait(wait_mode, &semaphore_list, timeout);
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags) {
+  return iree_hal_metal_shared_event_multi_wait(wait_mode, &semaphore_list, timeout, flags);
 }
 
 static iree_status_t iree_hal_metal_device_profiling_begin(
diff --git a/runtime/src/iree/hal/drivers/metal/shared_event.h b/runtime/src/iree/hal/drivers/metal/shared_event.h
index 4e3b827..24e9af6 100644
--- a/runtime/src/iree/hal/drivers/metal/shared_event.h
+++ b/runtime/src/iree/hal/drivers/metal/shared_event.h
@@ -39,7 +39,8 @@
 // |wait_mode| before |timeout|.
 iree_status_t iree_hal_metal_shared_event_multi_wait(
     iree_hal_wait_mode_t wait_mode,
-    const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout);
+    const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/runtime/src/iree/hal/drivers/metal/shared_event.m b/runtime/src/iree/hal/drivers/metal/shared_event.m
index 613954b..25800a1 100644
--- a/runtime/src/iree/hal/drivers/metal/shared_event.m
+++ b/runtime/src/iree/hal/drivers/metal/shared_event.m
@@ -135,7 +135,8 @@
 }
 
 static iree_status_t iree_hal_metal_shared_event_wait(iree_hal_semaphore_t* base_semaphore,
-                                                      uint64_t value, iree_timeout_t timeout) {
+                                                      uint64_t value, iree_timeout_t timeout,
+                                                      iree_hal_wait_flags_t flags) {
   iree_hal_metal_shared_event_t* semaphore = iree_hal_metal_shared_event_cast(base_semaphore);
 
   iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout);
@@ -200,12 +201,12 @@
 
 iree_status_t iree_hal_metal_shared_event_multi_wait(
     iree_hal_wait_mode_t wait_mode, const iree_hal_semaphore_list_t* semaphore_list,
-    iree_timeout_t timeout) {
+    iree_timeout_t timeout, iree_hal_wait_flags_t flags) {
   if (semaphore_list->count == 0) return iree_ok_status();
   // If there is only one semaphore, just wait on it.
   if (semaphore_list->count == 1) {
     return iree_hal_metal_shared_event_wait(semaphore_list->semaphores[0],
-                                            semaphore_list->payload_values[0], timeout);
+                                            semaphore_list->payload_values[0], timeout, flags);
   }
 
   iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout);
diff --git a/runtime/src/iree/hal/drivers/null/device.c b/runtime/src/iree/hal/drivers/null/device.c
index 3cdcc8e..0d9cc72 100644
--- a/runtime/src/iree/hal/drivers/null/device.c
+++ b/runtime/src/iree/hal/drivers/null/device.c
@@ -283,15 +283,16 @@
 }
 
 static iree_status_t iree_hal_null_device_create_semaphore(
-    iree_hal_device_t* base_device, uint64_t initial_value,
-    iree_hal_semaphore_flags_t flags, iree_hal_semaphore_t** out_semaphore) {
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    uint64_t initial_value, iree_hal_semaphore_flags_t flags,
+    iree_hal_semaphore_t** out_semaphore) {
   iree_hal_null_device_t* device = iree_hal_null_device_cast(base_device);
 
   // TODO(null): pass any additional resources required to create or track the
   // semaphore. The implementation could pool semaphores here.
   (void)device;
 
-  return iree_hal_null_semaphore_create(initial_value, flags,
+  return iree_hal_null_semaphore_create(queue_affinity, initial_value, flags,
                                         device->host_allocator, out_semaphore);
 }
 
@@ -522,7 +523,8 @@
 
 static iree_status_t iree_hal_null_device_wait_semaphores(
     iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
-    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) {
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags) {
   iree_hal_null_device_t* device = iree_hal_null_device_cast(base_device);
 
   // TODO(null): implement multi-wait as either an ALL (AND) or ANY (OR)
diff --git a/runtime/src/iree/hal/drivers/null/semaphore.c b/runtime/src/iree/hal/drivers/null/semaphore.c
index 65e3ca3..d3e972e 100644
--- a/runtime/src/iree/hal/drivers/null/semaphore.c
+++ b/runtime/src/iree/hal/drivers/null/semaphore.c
@@ -26,8 +26,9 @@
 }
 
 iree_status_t iree_hal_null_semaphore_create(
-    uint64_t initial_value, iree_hal_semaphore_flags_t flags,
-    iree_allocator_t host_allocator, iree_hal_semaphore_t** out_semaphore) {
+    iree_hal_queue_affinity_t queue_affinity, uint64_t initial_value,
+    iree_hal_semaphore_flags_t flags, iree_allocator_t host_allocator,
+    iree_hal_semaphore_t** out_semaphore) {
   IREE_ASSERT_ARGUMENT(out_semaphore);
   IREE_TRACE_ZONE_BEGIN(z0);
   *out_semaphore = NULL;
@@ -43,6 +44,15 @@
   // TODO(null): implement semaphores. Note that there is some basic support
   // provided for timepoints as part of iree/hal/utils/semaphore_base.h but the
   // actual synchronization aspects are handled by the implementation.
+  //
+  // If the DEVICE_LOCAL flag is set and a |queue_affinity| is assigned (and not
+  // just IREE_HAL_QUEUE_AFFINITY_ANY) then the implementation can assume it is
+  // only used on that set of queues (never waited/signaled from anywhere else).
+  // If DEVICE_LOCAL is not set then other devices may signal or wait.
+  //
+  // If the IREE_HAL_SEMAPHORE_FLAG_HOST_INTERRUPT flag is not set then waits
+  // from the host are allowed to spin instead of performing optimized platform
+  // blocking (via interrupt mechanisms).
   iree_status_t status =
       iree_make_status(IREE_STATUS_UNIMPLEMENTED, "semaphore not implemented");
 
@@ -137,7 +147,7 @@
 
 static iree_status_t iree_hal_null_semaphore_wait(
     iree_hal_semaphore_t* base_semaphore, uint64_t value,
-    iree_timeout_t timeout) {
+    iree_timeout_t timeout, iree_hal_wait_flags_t flags) {
   iree_hal_null_semaphore_t* semaphore =
       iree_hal_null_semaphore_cast(base_semaphore);
 
diff --git a/runtime/src/iree/hal/drivers/null/semaphore.h b/runtime/src/iree/hal/drivers/null/semaphore.h
index b27d361..94ebbd6 100644
--- a/runtime/src/iree/hal/drivers/null/semaphore.h
+++ b/runtime/src/iree/hal/drivers/null/semaphore.h
@@ -17,7 +17,8 @@
 // Creates a {Null} semaphore used for ordering queue operations and
 // synchronizing between host/device and device/device.
 iree_status_t iree_hal_null_semaphore_create(
-    uint64_t initial_value, iree_hal_semaphore_flags_t flags,
-    iree_allocator_t host_allocator, iree_hal_semaphore_t** out_semaphore);
+    iree_hal_queue_affinity_t queue_affinity, uint64_t initial_value,
+    iree_hal_semaphore_flags_t flags, iree_allocator_t host_allocator,
+    iree_hal_semaphore_t** out_semaphore);
 
 #endif  // IREE_HAL_DRIVERS_NULL_SEMAPHORE_H_
diff --git a/runtime/src/iree/hal/drivers/vulkan/native_semaphore.cc b/runtime/src/iree/hal/drivers/vulkan/native_semaphore.cc
index a0e3000..20ac806 100644
--- a/runtime/src/iree/hal/drivers/vulkan/native_semaphore.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/native_semaphore.cc
@@ -203,7 +203,7 @@
 iree_status_t iree_hal_vulkan_native_semaphore_multi_wait(
     iree::hal::vulkan::VkDeviceHandle* logical_device,
     const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout,
-    VkSemaphoreWaitFlags wait_flags) {
+    iree_hal_wait_flags_t flags, VkSemaphoreWaitFlags wait_flags) {
   if (semaphore_list->count == 0) return iree_ok_status();
 
   iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout);
@@ -273,7 +273,7 @@
 
 static iree_status_t iree_hal_vulkan_native_semaphore_wait(
     iree_hal_semaphore_t* base_semaphore, uint64_t value,
-    iree_timeout_t timeout) {
+    iree_timeout_t timeout, iree_hal_wait_flags_t flags) {
   iree_hal_vulkan_native_semaphore_t* semaphore =
       iree_hal_vulkan_native_semaphore_cast(base_semaphore);
   iree_hal_semaphore_list_t semaphore_list = {
@@ -282,7 +282,7 @@
       /*.payload_values=*/&value,
   };
   return iree_hal_vulkan_native_semaphore_multi_wait(
-      semaphore->logical_device, &semaphore_list, timeout, 0);
+      semaphore->logical_device, &semaphore_list, timeout, flags, 0);
 }
 
 IREE_API_EXPORT iree_status_t iree_hal_vulkan_semaphore_handle(
diff --git a/runtime/src/iree/hal/drivers/vulkan/native_semaphore.h b/runtime/src/iree/hal/drivers/vulkan/native_semaphore.h
index 4f98cd2..30c4893 100644
--- a/runtime/src/iree/hal/drivers/vulkan/native_semaphore.h
+++ b/runtime/src/iree/hal/drivers/vulkan/native_semaphore.h
@@ -38,7 +38,7 @@
 iree_status_t iree_hal_vulkan_native_semaphore_multi_wait(
     iree::hal::vulkan::VkDeviceHandle* logical_device,
     const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout,
-    VkSemaphoreWaitFlags wait_flags);
+    iree_hal_wait_flags_t flags, VkSemaphoreWaitFlags wait_flags);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
index e74bb71..b9a42f7 100644
--- a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
@@ -1605,8 +1605,9 @@
 }
 
 static iree_status_t iree_hal_vulkan_device_create_semaphore(
-    iree_hal_device_t* base_device, uint64_t initial_value,
-    iree_hal_semaphore_flags_t flags, iree_hal_semaphore_t** out_semaphore) {
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    uint64_t initial_value, iree_hal_semaphore_flags_t flags,
+    iree_hal_semaphore_t** out_semaphore) {
   iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
   return iree_hal_vulkan_native_semaphore_create(device->logical_device,
                                                  initial_value, out_semaphore);
@@ -1634,8 +1635,9 @@
     iree_device_size_t allocation_size, iree_hal_alloca_flags_t flags,
     iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
   // TODO(benvanik): queue-ordered allocations.
-  IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list,
-                                                    iree_infinite_timeout()));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_semaphore_list_wait(wait_semaphore_list, iree_infinite_timeout(),
+                                   IREE_HAL_WAIT_FLAG_DEFAULT));
   IREE_RETURN_IF_ERROR(
       iree_hal_allocator_allocate_buffer(iree_hal_device_allocator(base_device),
                                          params, allocation_size, out_buffer));
@@ -1755,7 +1757,8 @@
   // HACK: we don't track async resource lifetimes so we have to block.
   if (iree_status_is_ok(status)) {
     status = iree_hal_semaphore_list_wait(signal_semaphore_list,
-                                          iree_infinite_timeout());
+                                          iree_infinite_timeout(),
+                                          IREE_HAL_WAIT_FLAG_DEFAULT);
   }
 
   // TODO(indirect-cmd): when async these need to be retained until the
@@ -1773,14 +1776,15 @@
 
 static iree_status_t iree_hal_vulkan_device_wait_semaphores(
     iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
-    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) {
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags) {
   iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
   VkSemaphoreWaitFlags wait_flags = 0;
   if (wait_mode == IREE_HAL_WAIT_MODE_ANY) {
     wait_flags |= VK_SEMAPHORE_WAIT_ANY_BIT;
   }
   return iree_hal_vulkan_native_semaphore_multi_wait(
-      device->logical_device, &semaphore_list, timeout, wait_flags);
+      device->logical_device, &semaphore_list, timeout, flags, wait_flags);
 }
 
 static iree_status_t iree_hal_vulkan_device_profiling_begin(
diff --git a/runtime/src/iree/hal/fence.c b/runtime/src/iree/hal/fence.c
index a65adbd..2add0c7 100644
--- a/runtime/src/iree/hal/fence.c
+++ b/runtime/src/iree/hal/fence.c
@@ -245,11 +245,12 @@
 }
 
 IREE_API_EXPORT iree_status_t iree_hal_fence_wait(iree_hal_fence_t* fence,
-                                                  iree_timeout_t timeout) {
+                                                  iree_timeout_t timeout,
+                                                  iree_hal_wait_flags_t flags) {
   if (!fence || !fence->count) return iree_ok_status();
   IREE_TRACE_ZONE_BEGIN(z0);
   iree_status_t status = iree_hal_semaphore_list_wait(
-      iree_hal_fence_semaphore_list(fence), timeout);
+      iree_hal_fence_semaphore_list(fence), timeout, flags);
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
@@ -274,7 +275,7 @@
     case IREE_WAIT_SOURCE_COMMAND_WAIT_ONE: {
       const iree_timeout_t timeout =
           ((const iree_wait_source_wait_params_t*)params)->timeout;
-      return iree_hal_fence_wait(fence, timeout);
+      return iree_hal_fence_wait(fence, timeout, IREE_HAL_WAIT_FLAG_DEFAULT);
     }
     case IREE_WAIT_SOURCE_COMMAND_EXPORT: {
       const iree_wait_primitive_type_t target_type =
diff --git a/runtime/src/iree/hal/fence.h b/runtime/src/iree/hal/fence.h
index 1c33c9a..f7d7170 100644
--- a/runtime/src/iree/hal/fence.h
+++ b/runtime/src/iree/hal/fence.h
@@ -121,7 +121,8 @@
 // used to perform a join that will propagate failures from any semaphore used
 // in timepoints.
 IREE_API_EXPORT iree_status_t iree_hal_fence_wait(iree_hal_fence_t* fence,
-                                                  iree_timeout_t timeout);
+                                                  iree_timeout_t timeout,
+                                                  iree_hal_wait_flags_t flags);
 
 // Returns a wait source reference to |fence| after it reaches or exceeds
 // all defined timepoints.
diff --git a/runtime/src/iree/hal/semaphore.c b/runtime/src/iree/hal/semaphore.c
index 9c9be64..58da6ac 100644
--- a/runtime/src/iree/hal/semaphore.c
+++ b/runtime/src/iree/hal/semaphore.c
@@ -52,8 +52,9 @@
 IREE_HAL_API_RETAIN_RELEASE(semaphore);
 
 IREE_API_EXPORT iree_status_t iree_hal_semaphore_create(
-    iree_hal_device_t* device, uint64_t initial_value,
-    iree_hal_semaphore_flags_t flags, iree_hal_semaphore_t** out_semaphore) {
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    uint64_t initial_value, iree_hal_semaphore_flags_t flags,
+    iree_hal_semaphore_t** out_semaphore) {
   IREE_ASSERT_ARGUMENT(device);
   IREE_ASSERT_ARGUMENT(out_semaphore);
   *out_semaphore = NULL;
@@ -61,7 +62,7 @@
   IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, initial_value);
   iree_status_t status =
       IREE_HAL_VTABLE_DISPATCH(device, iree_hal_device, create_semaphore)(
-          device, initial_value, flags, out_semaphore);
+          device, queue_affinity, initial_value, flags, out_semaphore);
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
@@ -99,13 +100,14 @@
   IREE_TRACE_ZONE_END(z0);
 }
 
-IREE_API_EXPORT iree_status_t iree_hal_semaphore_wait(
-    iree_hal_semaphore_t* semaphore, uint64_t value, iree_timeout_t timeout) {
+IREE_API_EXPORT iree_status_t
+iree_hal_semaphore_wait(iree_hal_semaphore_t* semaphore, uint64_t value,
+                        iree_timeout_t timeout, iree_hal_wait_flags_t flags) {
   IREE_ASSERT_ARGUMENT(semaphore);
   IREE_TRACE_ZONE_BEGIN(z0);
   IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, value);
   iree_status_t status =
-      _VTABLE_DISPATCH(semaphore, wait)(semaphore, value, timeout);
+      _VTABLE_DISPATCH(semaphore, wait)(semaphore, value, timeout, flags);
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
@@ -134,7 +136,8 @@
     case IREE_WAIT_SOURCE_COMMAND_WAIT_ONE: {
       const iree_timeout_t timeout =
           ((const iree_wait_source_wait_params_t*)params)->timeout;
-      return iree_hal_semaphore_wait(semaphore, target_value, timeout);
+      return iree_hal_semaphore_wait(semaphore, target_value, timeout,
+                                     IREE_HAL_WAIT_FLAG_DEFAULT);
     }
     case IREE_WAIT_SOURCE_COMMAND_EXPORT: {
       const iree_wait_primitive_type_t target_type =
@@ -241,7 +244,8 @@
 }
 
 IREE_API_EXPORT iree_status_t iree_hal_semaphore_list_wait(
-    iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) {
+    iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags) {
   if (!semaphore_list.count) return iree_ok_status();
   IREE_TRACE_ZONE_BEGIN(z0);
 
@@ -259,7 +263,8 @@
   iree_status_t status = iree_ok_status();
   for (iree_host_size_t i = 0; i < semaphore_list.count; ++i) {
     status = iree_hal_semaphore_wait(semaphore_list.semaphores[i],
-                                     semaphore_list.payload_values[i], timeout);
+                                     semaphore_list.payload_values[i], timeout,
+                                     flags);
     if (!iree_status_is_ok(status)) break;
   }
 
diff --git a/runtime/src/iree/hal/semaphore.h b/runtime/src/iree/hal/semaphore.h
index bccadf1..1392762 100644
--- a/runtime/src/iree/hal/semaphore.h
+++ b/runtime/src/iree/hal/semaphore.h
@@ -26,9 +26,53 @@
 
 // A bitmask of flags controlling the behavior of a semaphore.
 enum iree_hal_semaphore_flag_bits_t {
-  IREE_HAL_SEMAPHORE_FLAG_NONE = 0u,
+  IREE_HAL_SEMAPHORE_FLAG_NONE = 0ull,
+
+  // Semaphore is only ever used on the same HAL device it was created on.
+  // Attempting to use the semaphore on another device even if provided by the
+  // same HAL driver will result in undefined behavior. If a specific queue
+  // affinity was provided during creation the semaphore may only be used on
+  // those queues.
+  IREE_HAL_SEMAPHORE_FLAG_DEVICE_LOCAL = 1ull << 0,
+
+  // Semaphore will be used as part of a blocking host wait operation and should
+  // support interrupts. Without this flag set host waits may spin instead of
+  // using platform waits and interrupts to reduce power consumption and CPU
+  // contention.
+  IREE_HAL_SEMAPHORE_FLAG_HOST_INTERRUPT = 1ull << 2,
+
+  // Semaphore object can be exported using iree_hal_semaphore_export. Only
+  // semaphore implementations that natively support timeline semantics can be
+  // exported like this. This is just a hint that export should be supported:
+  // export may still fail.
+  IREE_HAL_SEMAPHORE_FLAG_EXPORTABLE = 1ull << 3,
+
+  // Timepoints can be exported using iree_hal_semaphore_export_timepoint. This
+  // may require significant internal tracking and should only be used when
+  // interoperating with other APIs that do not natively support timeline
+  // semaphores. Independent of IREE_HAL_SEMAPHORE_FLAG_EXPORTABLE.
+  IREE_HAL_SEMAPHORE_FLAG_EXPORTABLE_TIMEPOINTS = 1ull << 4,
+
+  // Default flags: host interrupts and exportable timepoints are requested.
+  IREE_HAL_SEMAPHORE_FLAG_DEFAULT =
+      IREE_HAL_SEMAPHORE_FLAG_HOST_INTERRUPT |
+      IREE_HAL_SEMAPHORE_FLAG_EXPORTABLE_TIMEPOINTS,
 };
-typedef uint32_t iree_hal_semaphore_flags_t;
+typedef uint64_t iree_hal_semaphore_flags_t;
+
+// A bitmask of hints controlling how a wait operation should be performed.
+enum iree_hal_wait_flag_bits_e {
+  // Default blocking wait behavior (if able), possibly with an early query to
+  // avoid unneeded context switching. If waiting on the host and the semaphore
+  // was created without IREE_HAL_SEMAPHORE_FLAG_HOST_INTERRUPT the wait may
+  // fall back to an active wait.
+  IREE_HAL_WAIT_FLAG_DEFAULT = 0ull,
+
+  // Waiting thread will spin (possibly with backoff) until the wait condition
+  // has been satisfied or the deadline expires.
+  IREE_HAL_WAIT_FLAG_ACTIVE = 1ull << 0,
+};
+typedef uint64_t iree_hal_wait_flags_t;
 
 // The maximum valid payload value of an iree_hal_semaphore_t.
 // Payload values larger than this indicate that the semaphore has failed.
@@ -289,9 +333,15 @@
 // Creates a semaphore that can be used with command queues owned by this
 // device. To use the semaphores with other devices or instances they must
 // first be exported.
+//
+// By default the |queue_affinity| is a hint to the implementation of which
+// queues on the |device| will signal or wait on the semaphore. If
+// IREE_HAL_SEMAPHORE_FLAG_DEVICE_LOCAL is specified the |queue_affinity| will
+// indicate the semaphore is _only_ signaled or waited on those specific queues.
 IREE_API_EXPORT iree_status_t iree_hal_semaphore_create(
-    iree_hal_device_t* device, uint64_t initial_value,
-    iree_hal_semaphore_flags_t flags, iree_hal_semaphore_t** out_semaphore);
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    uint64_t initial_value, iree_hal_semaphore_flags_t flags,
+    iree_hal_semaphore_t** out_semaphore);
 
 // Retains the given |semaphore| for the caller.
 IREE_API_EXPORT void iree_hal_semaphore_retain(iree_hal_semaphore_t* semaphore);
@@ -338,8 +388,9 @@
 // Returns IREE_STATUS_ABORTED if one or more semaphores has failed. Callers can
 // use iree_hal_semaphore_query on the semaphores to find the ones that have
 // failed and get the status.
-IREE_API_EXPORT iree_status_t iree_hal_semaphore_wait(
-    iree_hal_semaphore_t* semaphore, uint64_t value, iree_timeout_t timeout);
+IREE_API_EXPORT iree_status_t
+iree_hal_semaphore_wait(iree_hal_semaphore_t* semaphore, uint64_t value,
+                        iree_timeout_t timeout, iree_hal_wait_flags_t flags);
 
 // Returns a wait source reference to |semaphore| after it reaches or exceeds
 // the specified payload |value|.
@@ -422,7 +473,8 @@
 // wait should be used to perform a join that will propagate failures from any
 // semaphore used in timepoints.
 IREE_API_EXPORT iree_status_t iree_hal_semaphore_list_wait(
-    iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout);
+    iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout,
+    iree_hal_wait_flags_t flags);
 
 //===----------------------------------------------------------------------===//
 // iree_hal_semaphore_t implementation details
@@ -439,7 +491,8 @@
                            iree_status_t status);
 
   iree_status_t(IREE_API_PTR* wait)(iree_hal_semaphore_t* semaphore,
-                                    uint64_t value, iree_timeout_t timeout);
+                                    uint64_t value, iree_timeout_t timeout,
+                                    iree_hal_wait_flags_t flags);
 
   iree_status_t(IREE_API_PTR* import_timepoint)(
       iree_hal_semaphore_t* semaphore, uint64_t value,
diff --git a/runtime/src/iree/hal/utils/debug_allocator.c b/runtime/src/iree/hal/utils/debug_allocator.c
index d0f7f43..758d3ca 100644
--- a/runtime/src/iree/hal/utils/debug_allocator.c
+++ b/runtime/src/iree/hal/utils/debug_allocator.c
@@ -158,8 +158,9 @@
               IREE_HAL_QUEUE_AFFINITY_ANY, 1, &command, &command_buffer));
 
   iree_hal_semaphore_t* semaphore = NULL;
-  iree_status_t status = iree_hal_semaphore_create(
-      device, 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore);
+  iree_status_t status =
+      iree_hal_semaphore_create(device, IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore);
 
   uint64_t signal_value = 1ull;
   if (iree_status_is_ok(status)) {
@@ -176,7 +177,8 @@
 
   if (iree_status_is_ok(status)) {
     status = iree_hal_semaphore_wait(semaphore, signal_value,
-                                     iree_infinite_timeout());
+                                     iree_infinite_timeout(),
+                                     IREE_HAL_WAIT_FLAG_DEFAULT);
   }
 
   iree_hal_semaphore_release(semaphore);
diff --git a/runtime/src/iree/hal/utils/file_transfer.c b/runtime/src/iree/hal/utils/file_transfer.c
index 199545d..6089af6 100644
--- a/runtime/src/iree/hal/utils/file_transfer.c
+++ b/runtime/src/iree/hal/utils/file_transfer.c
@@ -275,9 +275,9 @@
 
     // Create semaphore for tracking worker progress.
     worker->pending_timepoint = 0ull;
-    status = iree_hal_semaphore_create(device, worker->pending_timepoint,
-                                       IREE_HAL_SEMAPHORE_FLAG_NONE,
-                                       &worker->semaphore);
+    status = iree_hal_semaphore_create(
+        device, IREE_HAL_QUEUE_AFFINITY_ANY, worker->pending_timepoint,
+        IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &worker->semaphore);
     if (!iree_status_is_ok(status)) break;
   }
 
diff --git a/runtime/src/iree/hal/utils/semaphore_base_test.cc b/runtime/src/iree/hal/utils/semaphore_base_test.cc
index 33d8a3c..f0afb31 100644
--- a/runtime/src/iree/hal/utils/semaphore_base_test.cc
+++ b/runtime/src/iree/hal/utils/semaphore_base_test.cc
@@ -97,7 +97,8 @@
   }
 
   static iree_status_t Wait(iree_hal_semaphore_t* base_semaphore,
-                            uint64_t value, iree_timeout_t timeout) {
+                            uint64_t value, iree_timeout_t timeout,
+                            iree_hal_wait_flags_t flags) {
     auto* semaphore = Cast(base_semaphore);
     struct notify_state_t {
       TestSemaphore* semaphore;
@@ -306,8 +307,8 @@
   iree_event_set(&ev0);
 
   // Wait for the semaphore to be signaled.
-  IREE_ASSERT_OK(
-      iree_hal_semaphore_wait(*semaphore, 1ull, iree_infinite_timeout()));
+  IREE_ASSERT_OK(iree_hal_semaphore_wait(
+      *semaphore, 1ull, iree_infinite_timeout(), IREE_HAL_WAIT_FLAG_DEFAULT));
 
   // Should have been called back on the thread.
   ASSERT_EQ(state.callback_count, 1);
diff --git a/runtime/src/iree/io/parameter_index_provider.c b/runtime/src/iree/io/parameter_index_provider.c
index 1a5066e..3043023 100644
--- a/runtime/src/iree/io/parameter_index_provider.c
+++ b/runtime/src/iree/io/parameter_index_provider.c
@@ -403,10 +403,11 @@
   const bool is_first_timeline_use = timeline_semaphore == NULL;
   if (!timeline_semaphore) {
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
-        z0, iree_hal_semaphore_create(
-                batch->device, batch->timeline_values[timeline_index],
-                IREE_HAL_SEMAPHORE_FLAG_NONE,
-                &batch->timeline_semaphores[timeline_index]));
+        z0,
+        iree_hal_semaphore_create(batch->device, IREE_HAL_QUEUE_AFFINITY_ANY,
+                                  batch->timeline_values[timeline_index],
+                                  IREE_HAL_SEMAPHORE_FLAG_DEFAULT,
+                                  &batch->timeline_semaphores[timeline_index]));
     timeline_semaphore = batch->timeline_semaphores[timeline_index];
   }
   const uint64_t previous_timeline_value =
diff --git a/runtime/src/iree/modules/check/module.cc b/runtime/src/iree/modules/check/module.cc
index 586921e..a77298e 100644
--- a/runtime/src/iree/modules/check/module.cc
+++ b/runtime/src/iree/modules/check/module.cc
@@ -336,8 +336,9 @@
 
   IREE_RETURN_IF_ERROR(iree_hal_command_buffer_end(command_buffer.get()));
   vm::ref<iree_hal_semaphore_t> semaphore;
-  IREE_RETURN_IF_ERROR(iree_hal_semaphore_create(
-      device, 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_semaphore_create(device, IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore));
   vm::ref<iree_hal_fence_t> fence;
   IREE_RETURN_IF_ERROR(iree_hal_fence_create_at(
       semaphore.get(), 1ull, iree_hal_device_host_allocator(device), &fence));
@@ -345,8 +346,8 @@
       device, IREE_HAL_QUEUE_AFFINITY_ANY, iree_hal_semaphore_list_empty(),
       iree_hal_fence_semaphore_list(fence.get()), command_buffer.get(),
       iree_hal_buffer_binding_table_empty(), IREE_HAL_EXECUTE_FLAG_NONE));
-  IREE_RETURN_IF_ERROR(
-      iree_hal_fence_wait(fence.get(), iree_infinite_timeout()));
+  IREE_RETURN_IF_ERROR(iree_hal_fence_wait(fence.get(), iree_infinite_timeout(),
+                                           IREE_HAL_WAIT_FLAG_DEFAULT));
   return std::move(target_views);
 }
 
diff --git a/runtime/src/iree/modules/hal/module.c b/runtime/src/iree/modules/hal/module.c
index eb02b3c..4c46e44 100644
--- a/runtime/src/iree/modules/hal/module.c
+++ b/runtime/src/iree/modules/hal/module.c
@@ -1655,8 +1655,9 @@
   // This should be reworked to just create the fence.
 
   iree_hal_semaphore_t* semaphore = NULL;
-  IREE_RETURN_IF_ERROR(iree_hal_semaphore_create(
-      device, 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_semaphore_create(device, IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore));
 
   // Create fence with room for our single semaphore.
   iree_hal_fence_t* fence = NULL;
@@ -1927,7 +1928,8 @@
         // Block the native thread until the fence is reached or the deadline is
         // exceeded.
         for (iree_host_size_t i = 0; i < fence_count; ++i) {
-          wait_status = iree_hal_fence_wait(fences[i], timeout);
+          wait_status = iree_hal_fence_wait(fences[i], timeout,
+                                            IREE_HAL_WAIT_FLAG_DEFAULT);
           if (!iree_status_is_ok(wait_status)) break;
         }
       } else {
diff --git a/runtime/src/iree/tooling/function_util.c b/runtime/src/iree/tooling/function_util.c
index 0c80d23..477abdf 100644
--- a/runtime/src/iree/tooling/function_util.c
+++ b/runtime/src/iree/tooling/function_util.c
@@ -25,8 +25,9 @@
   // Create the signal fence as a 0->1 transition. The caller will wait on that.
   iree_hal_semaphore_t* semaphore = NULL;
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_hal_semaphore_create(device, 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE,
-                                    &semaphore));
+      z0,
+      iree_hal_semaphore_create(device, IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+                                IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore));
   iree_hal_fence_t* signal_fence = NULL;
   iree_status_t status = iree_hal_fence_create_at(
       semaphore, 1ull, iree_hal_device_host_allocator(device), &signal_fence);
@@ -111,8 +112,9 @@
   if (needs_wait) {
     iree_hal_semaphore_t* semaphore = NULL;
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
-        z0, iree_hal_semaphore_create(
-                device, 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore));
+        z0,
+        iree_hal_semaphore_create(device, IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+                                  IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore));
     status = iree_hal_fence_create_at(
         semaphore, 1ull, iree_hal_device_host_allocator(device), &signal_fence);
     iree_hal_semaphore_release(semaphore);
@@ -128,7 +130,8 @@
   }
 
   if (iree_status_is_ok(status) && needs_wait) {
-    status = iree_hal_fence_wait(signal_fence, iree_infinite_timeout());
+    status = iree_hal_fence_wait(signal_fence, iree_infinite_timeout(),
+                                 IREE_HAL_WAIT_FLAG_DEFAULT);
   }
 
   iree_hal_fence_release(signal_fence);
diff --git a/runtime/src/iree/tooling/run_module.c b/runtime/src/iree/tooling/run_module.c
index 2fa4082..8c09a16 100644
--- a/runtime/src/iree/tooling/run_module.c
+++ b/runtime/src/iree/tooling/run_module.c
@@ -260,7 +260,8 @@
   // If the function is async we need to wait for it to complete.
   if (iree_status_is_ok(status) && finish_fence) {
     IREE_RETURN_IF_ERROR(
-        iree_hal_fence_wait(finish_fence, iree_infinite_timeout()),
+        iree_hal_fence_wait(finish_fence, iree_infinite_timeout(),
+                            IREE_HAL_WAIT_FLAG_DEFAULT),
         "waiting on finish fence");
   }
   iree_hal_fence_release(finish_fence);
diff --git a/samples/custom_module/async/main.c b/samples/custom_module/async/main.c
index 6617e72..b6233bf 100644
--- a/samples/custom_module/async/main.c
+++ b/samples/custom_module/async/main.c
@@ -105,8 +105,9 @@
   // We'll pass these in with the timeline at T=0 so that the runtime isn't
   // allowed to execute anything until we give it the go-ahead.
   iree_hal_semaphore_t* semaphore = NULL;
-  IREE_CHECK_OK(iree_hal_semaphore_create(
-      device, 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore));
+  IREE_CHECK_OK(iree_hal_semaphore_create(device, IREE_HAL_QUEUE_AFFINITY_ANY,
+                                          0ull, IREE_HAL_SEMAPHORE_FLAG_DEFAULT,
+                                          &semaphore));
   iree_hal_fence_t* fence_t1 = NULL;
   IREE_CHECK_OK(
       iree_hal_fence_create_at(semaphore, 1ull, host_allocator, &fence_t1));
@@ -151,7 +152,8 @@
 
   // We could go do other things now while the async work progresses. Here we
   // just immediately wait.
-  IREE_CHECK_OK(iree_hal_fence_wait(fence_t2, iree_infinite_timeout()));
+  IREE_CHECK_OK(iree_hal_fence_wait(fence_t2, iree_infinite_timeout(),
+                                    IREE_HAL_WAIT_FLAG_DEFAULT));
   fprintf(stdout, "REACHED T=2\n");
   fflush(stdout);
 
diff --git a/samples/custom_module/async/module.cc b/samples/custom_module/async/module.cc
index ca8ad71..9234730 100644
--- a/samples/custom_module/async/module.cc
+++ b/samples/custom_module/async/module.cc
@@ -137,8 +137,8 @@
     // or add the fence to a multi-wait operation. Here we just block the
     // thread until ready. Due to the nature of ordering it's possible the
     // fence has already been signaled by the time we get here.
-    Status status =
-        iree_hal_fence_wait(wait_fence_.get(), iree_infinite_timeout());
+    Status status = iree_hal_fence_wait(
+        wait_fence_.get(), iree_infinite_timeout(), IREE_HAL_WAIT_FLAG_DEFAULT);
 
     fprintf(stdout, "ASYNC: AFTER WAIT\n");
     fflush(stdout);
@@ -203,7 +203,8 @@
     // TODO(benvanik): better fence helpers when timelines are not needed.
     vm::ref<iree_hal_semaphore_t> semaphore;
     IREE_RETURN_IF_ERROR(iree_hal_semaphore_create(
-        device_.get(), 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore));
+        device_.get(), IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+        IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &semaphore));
     vm::ref<iree_hal_fence_t> alloca_fence;
     IREE_RETURN_IF_ERROR(iree_hal_fence_create_at(
         semaphore.get(), 1ull, host_allocator_, &alloca_fence));
diff --git a/tools/iree-benchmark-executable-main.c b/tools/iree-benchmark-executable-main.c
index ab68777..da2a6be 100644
--- a/tools/iree-benchmark-executable-main.c
+++ b/tools/iree-benchmark-executable-main.c
@@ -200,9 +200,9 @@
 
   iree_hal_semaphore_t* fence_semaphore = NULL;
   uint64_t fence_value = 0ull;
-  IREE_RETURN_IF_ERROR(iree_hal_semaphore_create(args->device, fence_value,
-                                                 IREE_HAL_SEMAPHORE_FLAG_NONE,
-                                                 &fence_semaphore));
+  IREE_RETURN_IF_ERROR(iree_hal_semaphore_create(
+      args->device, IREE_HAL_QUEUE_AFFINITY_ANY, fence_value,
+      IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &fence_semaphore));
   iree_hal_semaphore_list_t wait_semaphore_list =
       iree_hal_semaphore_list_empty();
   iree_hal_semaphore_list_t signal_semaphore_list = {
@@ -264,7 +264,8 @@
     // Note that this will include round-trip overhead and if the dispatch or
     // batch size is small then the final time may end up being mostly overhead.
     IREE_RETURN_IF_ERROR(iree_hal_semaphore_wait(fence_semaphore, fence_value,
-                                                 iree_infinite_timeout()));
+                                                 iree_infinite_timeout(),
+                                                 IREE_HAL_WAIT_FLAG_DEFAULT));
 
     iree_benchmark_pause_timing(benchmark_state);
 
diff --git a/tools/iree-benchmark-module-main.cc b/tools/iree-benchmark-module-main.cc
index fadf175..f6f5ff5 100644
--- a/tools/iree-benchmark-module-main.cc
+++ b/tools/iree-benchmark-module-main.cc
@@ -253,7 +253,8 @@
     for (int32_t i = 0; i < batch_concurrency; ++i) {
       vm::ref<iree_hal_semaphore_t> timeline_semaphore;
       IREE_CHECK_OK(iree_hal_semaphore_create(
-          device, 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &timeline_semaphore));
+          device, IREE_HAL_QUEUE_AFFINITY_ANY, 0ull,
+          IREE_HAL_SEMAPHORE_FLAG_DEFAULT, &timeline_semaphore));
       timeline_semaphores.push_back(std::move(timeline_semaphore));
     }
 
@@ -318,8 +319,9 @@
                            /*policy=*/nullptr, invocation_inputs[i].get(),
                            invocation_outputs[i].get(), host_allocator));
       }
-      IREE_CHECK_OK(
-          iree_hal_fence_wait(completion_fence.get(), iree_infinite_timeout()));
+      IREE_CHECK_OK(iree_hal_fence_wait(completion_fence.get(),
+                                        iree_infinite_timeout(),
+                                        IREE_HAL_WAIT_FLAG_DEFAULT));
     }
     state.PauseTiming();