Adding iree_hal_device_queue_update and improving queue DMA operations. (#19000)

As with all queue DMA operations it's best if things are batched into
command buffers, but it's bad to have a command buffer containing only a
single DMA operation. This change completes the set of fill/update/copy
operations at the queue level to match the command buffer DMA operations.
Practically this is useful when combined with reusable/indirect command
buffers for uploading new parameters in queue order prior to issuing a
command buffer that references them. The compiler will use this to turn
push constants into uniform buffers. An emulated version is added;
implementations are encouraged to do better, but none currently do.

While updating the queue API I've added placeholder flags to all DMA
operations in preparation for compiler updates that will provide them.
`iree_hal_device_queue_execute` has needed simplification for a while and
that's done here so that implementations no longer need to worry about
batched command buffer juggling. The unused-since-its-inception
`iree_hal_command_buffer_discard_buffer` API has been renamed to
`iree_hal_command_buffer_advise_buffer` ahead of compiler changes that
will use it for multi-device cache management.

No breaking changes to the compiler here - future PRs will update the
HAL module and ops.
diff --git a/experimental/web/sample_webgpu/main.c b/experimental/web/sample_webgpu/main.c
index 6463fd4..8f7d4d4 100644
--- a/experimental/web/sample_webgpu/main.c
+++ b/experimental/web/sample_webgpu/main.c
@@ -794,8 +794,8 @@
     };
     status = iree_hal_device_queue_execute(
         device, IREE_HAL_QUEUE_AFFINITY_ANY, iree_hal_semaphore_list_empty(),
-        signal_semaphores, 1, &transfer_command_buffer,
-        /*binding_tables=*/NULL);
+        signal_semaphores, transfer_command_buffer,
+        iree_hal_buffer_binding_table_empty());
   }
   // TODO(scotttodd): Make this async - pass a wait source to iree_loop_wait_one
   //     1. create iree_hal_fence_t, iree_hal_fence_insert(fance, semaphore)
diff --git a/experimental/webgpu/command_buffer.c b/experimental/webgpu/command_buffer.c
index 9240320..e4ad81b 100644
--- a/experimental/webgpu/command_buffer.c
+++ b/experimental/webgpu/command_buffer.c
@@ -575,9 +575,10 @@
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_webgpu_command_buffer_discard_buffer(
+static iree_status_t iree_hal_webgpu_command_buffer_advise_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t buffer_ref) {
+    iree_hal_buffer_ref_t buffer_ref, iree_hal_memory_advise_flags_t flags,
+    uint64_t arg0, uint64_t arg1) {
   // No-op: though maybe it'd be a useful addition to the spec as otherwise
   // false dependencies can creep in.
   return iree_ok_status();
@@ -608,7 +609,7 @@
 static iree_status_t iree_hal_webgpu_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_buffer_ref_t target_ref, const void* pattern,
-    iree_host_size_t pattern_length) {
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
   iree_hal_webgpu_command_buffer_t* command_buffer =
       iree_hal_webgpu_command_buffer_cast(base_command_buffer);
 
@@ -693,7 +694,8 @@
 
 static iree_status_t iree_hal_webgpu_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref,
+    iree_hal_update_flags_t flags) {
   iree_hal_webgpu_command_buffer_t* command_buffer =
       iree_hal_webgpu_command_buffer_cast(base_command_buffer);
 
@@ -734,7 +736,8 @@
 
 static iree_status_t iree_hal_webgpu_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref,
+    iree_hal_copy_flags_t flags) {
   iree_hal_webgpu_command_buffer_t* command_buffer =
       iree_hal_webgpu_command_buffer_cast(base_command_buffer);
 
@@ -1041,7 +1044,7 @@
     .signal_event = iree_hal_webgpu_command_buffer_signal_event,
     .reset_event = iree_hal_webgpu_command_buffer_reset_event,
     .wait_events = iree_hal_webgpu_command_buffer_wait_events,
-    .discard_buffer = iree_hal_webgpu_command_buffer_discard_buffer,
+    .advise_buffer = iree_hal_webgpu_command_buffer_advise_buffer,
     .fill_buffer = iree_hal_webgpu_command_buffer_fill_buffer,
     .update_buffer = iree_hal_webgpu_command_buffer_update_buffer,
     .copy_buffer = iree_hal_webgpu_command_buffer_copy_buffer,
diff --git a/experimental/webgpu/webgpu_device.c b/experimental/webgpu/webgpu_device.c
index 5498caf..c9a2457 100644
--- a/experimental/webgpu/webgpu_device.c
+++ b/experimental/webgpu/webgpu_device.c
@@ -354,7 +354,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_file_t* source_file, uint64_t source_offset,
     iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_read_flags_t flags) {
   // TODO: expose streaming chunk count/size options.
   iree_status_t loop_status = iree_ok_status();
   iree_hal_file_transfer_options_t options = {
@@ -376,7 +376,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
     iree_hal_file_t* target_file, uint64_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_write_flags_t flags) {
   // TODO: expose streaming chunk count/size options.
   iree_status_t loop_status = iree_ok_status();
   iree_hal_file_transfer_options_t options = {
@@ -396,9 +396,8 @@
     iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
-    iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers,
-    iree_hal_buffer_binding_table_t const* binding_tables) {
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_buffer_binding_table_t binding_table) {
   iree_hal_webgpu_device_t* device = iree_hal_webgpu_device_cast(base_device);
 
   // TODO(benvanik): this currently assumes we are synchronizing on semaphores
@@ -410,11 +409,8 @@
                                                     iree_infinite_timeout()));
 
   // TODO(benvanik): propagate errors to semaphores.
-  for (iree_host_size_t i = 0; i < command_buffer_count; i++) {
-    iree_hal_command_buffer_t* command_buffer = command_buffers[i];
-    IREE_RETURN_IF_ERROR(
-        iree_hal_webgpu_command_buffer_issue(command_buffer, device->queue));
-  }
+  IREE_RETURN_IF_ERROR(
+      iree_hal_webgpu_command_buffer_issue(command_buffer, device->queue));
 
   IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_signal(signal_semaphore_list));
 
@@ -473,6 +469,9 @@
         iree_hal_webgpu_device_query_semaphore_compatibility,
     .queue_alloca = iree_hal_webgpu_device_queue_alloca,
     .queue_dealloca = iree_hal_webgpu_device_queue_dealloca,
+    .queue_fill = iree_hal_device_queue_emulated_fill,
+    .queue_update = iree_hal_device_queue_emulated_update,
+    .queue_copy = iree_hal_device_queue_emulated_copy,
     .queue_read = iree_hal_webgpu_device_queue_read,
     .queue_write = iree_hal_webgpu_device_queue_write,
     .queue_execute = iree_hal_webgpu_device_queue_execute,
diff --git a/integrations/pjrt/src/iree_pjrt/common/api_impl.cc b/integrations/pjrt/src/iree_pjrt/common/api_impl.cc
index 694cfa8..a7c5cc7 100644
--- a/integrations/pjrt/src/iree_pjrt/common/api_impl.cc
+++ b/integrations/pjrt/src/iree_pjrt/common/api_impl.cc
@@ -590,8 +590,8 @@
       device_.device(), IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/iree_hal_fence_semaphore_list(ready_fence_.get()),
       /*signal_semaphore_list=*/
-      iree_hal_fence_semaphore_list(dst_buffer_ready_fence.get()),
-      /*command_buffer_count=*/1, &transfer_cb, NULL));
+      iree_hal_fence_semaphore_list(dst_buffer_ready_fence.get()), transfer_cb,
+      iree_hal_buffer_binding_table_empty()));
 
   *out_done_event = copy_done_event;
   return iree_ok_status();
@@ -837,7 +837,8 @@
   IREE_CHECK_OK(iree_hal_command_buffer_begin(transfer_cb.get()));
   IREE_RETURN_IF_ERROR(iree_hal_command_buffer_fill_buffer(
       transfer_cb.get(), buffer.get(), /*target_offset=*/0,
-      /*target_size=*/byte_length, data, element_type_byte_size));
+      /*target_size=*/byte_length, data, element_type_byte_size,
+      IREE_HAL_FILL_FLAG_NONE));
   IREE_CHECK_OK(iree_hal_command_buffer_end(transfer_cb.get()));
 
   // Execute the enqueued splat:
@@ -846,8 +847,8 @@
       /*wait_semaphore_list=*/
       {1, &transfer_timeline_, &signal_alloca_complete},
       /*signal_semaphore_list=*/
-      {1, &transfer_timeline_, &signal_copy_complete},
-      /*command_buffer_count=*/1, &transfer_cb, NULL));
+      {1, &transfer_timeline_, &signal_copy_complete}, transfer_cb,
+      iree_hal_buffer_binding_table_empty()));
 
   // Wrap in a buffer view and return:
   iree::vm::ref<iree_hal_buffer_view_t> result_buffer_view;
@@ -1190,8 +1191,8 @@
       /*wait_semaphore_list=*/
       {1, &transfer_timeline_, &signal_alloca_complete},
       /*signal_semaphore_list=*/
-      {1, &transfer_timeline_, &signal_copy_complete},
-      /*command_buffer_count=*/1, &transfer_cb, NULL));
+      {1, &transfer_timeline_, &signal_copy_complete}, transfer_cb,
+      iree_hal_buffer_binding_table_empty()));
 
   // Wrap in a buffer view and return.
   iree::vm::ref<iree_hal_buffer_view_t> result_buffer_view;
diff --git a/integrations/pjrt/src/iree_pjrt/common/iree_helpers.h b/integrations/pjrt/src/iree_pjrt/common/iree_helpers.h
index fce48ba..7577619 100644
--- a/integrations/pjrt/src/iree_pjrt/common/iree_helpers.h
+++ b/integrations/pjrt/src/iree_pjrt/common/iree_helpers.h
@@ -139,8 +139,7 @@
     iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
-    iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers) {
+    iree_hal_command_buffer_t* command_buffer) {
   if (LOGGING_ENABLED) {
     LogInvoke(__func__, "device=%p, wait={%s}, signal={%s}", device,
               SemaphoreListToString(wait_semaphore_list).c_str(),
@@ -148,8 +147,8 @@
   }
   return HandleStatus(__func__, iree_hal_device_queue_execute(
                                     device, queue_affinity, wait_semaphore_list,
-                                    signal_semaphore_list, command_buffer_count,
-                                    command_buffers, /*binding_tables=*/NULL));
+                                    signal_semaphore_list, command_buffer,
+                                    iree_hal_buffer_binding_table_empty()));
 }
 
 iree_status_t hal_fence_create(iree_host_size_t capacity,
diff --git a/runtime/bindings/python/hal.cc b/runtime/bindings/python/hal.cc
index 7a0e0cd..a25a4ad 100644
--- a/runtime/bindings/python/hal.cc
+++ b/runtime/bindings/python/hal.cc
@@ -499,7 +499,7 @@
       "deallocating memory on queue");
 }
 
-void HalDevice::QueueExecute(py::handle command_buffers,
+void HalDevice::QueueExecute(py::handle command_buffer,
                              py::handle wait_semaphores,
                              py::handle signal_semaphores) {
   iree_hal_semaphore_list_t wait_list;
@@ -548,17 +548,14 @@
   }
 
   // Unpack command buffers.
-  size_t cb_count = py::len(command_buffers);
-  iree_hal_command_buffer_t** cb_list =
-      static_cast<iree_hal_command_buffer_t**>(
-          alloca(sizeof(iree_hal_command_buffer_t*) * cb_count));
-  for (size_t i = 0; i < cb_count; ++i) {
-    cb_list[i] = py::cast<HalCommandBuffer*>(command_buffers[i])->raw_ptr();
-  }
+  iree_hal_command_buffer_t* cb =
+      !command_buffer.is_none()
+          ? py::cast<HalCommandBuffer*>(command_buffer)->raw_ptr()
+          : NULL;
 
   CheckApiStatus(iree_hal_device_queue_execute(
                      raw_ptr(), IREE_HAL_QUEUE_AFFINITY_ANY, wait_list,
-                     signal_list, cb_count, cb_list, /*binding_tables=*/NULL),
+                     signal_list, cb, iree_hal_buffer_binding_table_empty()),
                  "executing command buffers");
 }
 
@@ -619,11 +616,12 @@
         "Source and buffer length must be less than the target buffer length "
         "and it does not. Please check allocations");
   }
-  CheckApiStatus(iree_hal_device_queue_copy(
-                     raw_ptr(), IREE_HAL_QUEUE_AFFINITY_ANY, wait_list,
-                     signal_list, source_buffer.raw_ptr(), 0,
-                     target_buffer.raw_ptr(), 0, source_length),
-                 "Copying buffer on queue");
+  CheckApiStatus(
+      iree_hal_device_queue_copy(
+          raw_ptr(), IREE_HAL_QUEUE_AFFINITY_ANY, wait_list, signal_list,
+          source_buffer.raw_ptr(), 0, target_buffer.raw_ptr(), 0, source_length,
+          IREE_HAL_COPY_FLAG_NONE),
+      "Copying buffer on queue");
 }
 
 py::object HalDevice::CreateDLPackCapsule(HalBufferView& buffer_view,
@@ -1729,7 +1727,8 @@
                     iree_hal_make_buffer_ref(source_buffer.raw_ptr(),
                                              source_offset, resolved_length),
                     iree_hal_make_buffer_ref(target_buffer.raw_ptr(),
-                                             target_offset, resolved_length)),
+                                             target_offset, resolved_length),
+                    IREE_HAL_COPY_FLAG_NONE),
                 "copy command");
             if (end) {
               CheckApiStatus(iree_hal_command_buffer_end(self.raw_ptr()),
@@ -1767,7 +1766,8 @@
                     self.raw_ptr(),
                     iree_hal_make_buffer_ref(target_buffer.raw_ptr(),
                                              target_offset, resolved_length),
-                    pattern_view.buf, pattern_view.len),
+                    pattern_view.buf, pattern_view.len,
+                    IREE_HAL_FILL_FLAG_NONE),
                 "command buffer fill");
             if (end) {
               CheckApiStatus(iree_hal_command_buffer_end(self.raw_ptr()),
diff --git a/runtime/bindings/python/iree/runtime/_binding.pyi b/runtime/bindings/python/iree/runtime/_binding.pyi
index 040b92f..b4ef2ba 100644
--- a/runtime/bindings/python/iree/runtime/_binding.pyi
+++ b/runtime/bindings/python/iree/runtime/_binding.pyi
@@ -185,7 +185,7 @@
     ) -> None: ...
     def queue_execute(
         self,
-        command_buffers: Sequence[HalCommandBuffer],
+        command_buffer: HalCommandBuffer,
         wait_semaphores: HalSemaphoreList,
         signal_semaphores: HalSemaphoreList,
     ) -> None: ...
diff --git a/runtime/bindings/python/tests/hal_test.py b/runtime/bindings/python/tests/hal_test.py
index 21079c9..348a628 100644
--- a/runtime/bindings/python/tests/hal_test.py
+++ b/runtime/bindings/python/tests/hal_test.py
@@ -463,7 +463,7 @@
 
         sem = self.device.create_semaphore(0)
         self.device.queue_execute(
-            [cb], wait_semaphores=[(sem, 0)], signal_semaphores=[(sem, 1)]
+            cb, wait_semaphores=[(sem, 0)], signal_semaphores=[(sem, 1)]
         )
         iree.runtime.HalFence.create_at(sem, 1).wait()
 
@@ -479,7 +479,7 @@
 
         sem = self.device.create_semaphore(0)
         self.device.queue_execute(
-            [cb],
+            cb,
             wait_semaphores=iree.runtime.HalFence.create_at(sem, 0),
             signal_semaphores=iree.runtime.HalFence.create_at(sem, 1),
         )
diff --git a/runtime/src/iree/hal/buffer_transfer.c b/runtime/src/iree/hal/buffer_transfer.c
index bb4780f..d4f30bb 100644
--- a/runtime/src/iree/hal/buffer_transfer.c
+++ b/runtime/src/iree/hal/buffer_transfer.c
@@ -78,7 +78,7 @@
     };
     status = iree_hal_device_queue_execute(
         device, IREE_HAL_QUEUE_AFFINITY_ANY, wait_semaphores, signal_semaphores,
-        1, &command_buffer, /*binding_tables=*/NULL);
+        command_buffer, iree_hal_buffer_binding_table_empty());
   }
   if (iree_status_is_ok(status)) {
     status = iree_hal_semaphore_wait(fence_semaphore, signal_value, timeout);
diff --git a/runtime/src/iree/hal/command_buffer.c b/runtime/src/iree/hal/command_buffer.c
index 44d767a..7f26263 100644
--- a/runtime/src/iree/hal/command_buffer.c
+++ b/runtime/src/iree/hal/command_buffer.c
@@ -405,25 +405,27 @@
   return status;
 }
 
-IREE_API_EXPORT iree_status_t iree_hal_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t* command_buffer,
-    iree_hal_buffer_ref_t buffer_ref) {
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_advise_buffer(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_ref_t buffer_ref,
+    iree_hal_memory_advise_flags_t flags, uint64_t arg0, uint64_t arg1) {
   IREE_ASSERT_ARGUMENT(command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
   IF_VALIDATING(command_buffer, {
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
-        z0, iree_hal_command_buffer_discard_buffer_validation(
-                command_buffer, VALIDATION_STATE(command_buffer), buffer_ref));
+        z0, iree_hal_command_buffer_advise_buffer_validation(
+                command_buffer, VALIDATION_STATE(command_buffer), buffer_ref,
+                flags, arg0, arg1));
   });
-  iree_status_t status = _VTABLE_DISPATCH(command_buffer, discard_buffer)(
-      command_buffer, buffer_ref);
+  iree_status_t status = _VTABLE_DISPATCH(command_buffer, advise_buffer)(
+      command_buffer, buffer_ref, flags, arg0, arg1);
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
 
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_ref_t target_ref,
-    const void* pattern, iree_host_size_t pattern_length) {
+    const void* pattern, iree_host_size_t pattern_length,
+    iree_hal_fill_flags_t flags) {
   IREE_ASSERT_ARGUMENT(command_buffer);
   if (target_ref.length == 0) {
     // No-op fill. All other validation is skipped.
@@ -434,17 +436,18 @@
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
         z0, iree_hal_command_buffer_fill_buffer_validation(
                 command_buffer, VALIDATION_STATE(command_buffer), target_ref,
-                pattern, pattern_length));
+                pattern, pattern_length, flags));
   });
   iree_status_t status = _VTABLE_DISPATCH(command_buffer, fill_buffer)(
-      command_buffer, target_ref, pattern, pattern_length);
+      command_buffer, target_ref, pattern, pattern_length, flags);
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
 
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_update_buffer(
     iree_hal_command_buffer_t* command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref,
+    iree_hal_update_flags_t flags) {
   IREE_ASSERT_ARGUMENT(command_buffer);
   IREE_ASSERT_ARGUMENT(source_buffer);
   if (target_ref.length == 0) {
@@ -456,17 +459,17 @@
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
         z0, iree_hal_command_buffer_update_buffer_validation(
                 command_buffer, VALIDATION_STATE(command_buffer), source_buffer,
-                source_offset, target_ref));
+                source_offset, target_ref, flags));
   });
   iree_status_t status = _VTABLE_DISPATCH(command_buffer, update_buffer)(
-      command_buffer, source_buffer, source_offset, target_ref);
+      command_buffer, source_buffer, source_offset, target_ref, flags);
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
 
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_ref_t source_ref,
-    iree_hal_buffer_ref_t target_ref) {
+    iree_hal_buffer_ref_t target_ref, iree_hal_copy_flags_t flags) {
   IREE_ASSERT_ARGUMENT(command_buffer);
   if (target_ref.length == 0) {
     // No-op copy. All other validation is skipped.
@@ -477,10 +480,10 @@
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
         z0, iree_hal_command_buffer_copy_buffer_validation(
                 command_buffer, VALIDATION_STATE(command_buffer), source_ref,
-                target_ref));
+                target_ref, flags));
   });
   iree_status_t status = _VTABLE_DISPATCH(command_buffer, copy_buffer)(
-      command_buffer, source_ref, target_ref);
+      command_buffer, source_ref, target_ref, flags);
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
@@ -589,7 +592,7 @@
 
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_validate_submission(
     iree_hal_command_buffer_t* command_buffer,
-    const iree_hal_buffer_binding_table_t* binding_table) {
+    iree_hal_buffer_binding_table_t binding_table) {
   IREE_ASSERT_ARGUMENT(command_buffer);
 
   // Validate the command buffer has been recorded properly.
@@ -604,17 +607,16 @@
   // the command buffer was allocated with.
   if (command_buffer->binding_count == 0) {
     return iree_ok_status();
-  } else if (!binding_table) {
+  } else if (binding_table.count == 0) {
     return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                             "indirect command buffer requires at least %u "
                             "bindings but no binding table was provided",
                             command_buffer->binding_count);
-  } else if (binding_table->count < command_buffer->binding_count) {
+  } else if (binding_table.count < command_buffer->binding_count) {
     return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
                             "indirect command buffer requires at least %u "
                             "bindings but only %" PRIhsz " were provided ",
-                            command_buffer->binding_count,
-                            binding_table->count);
+                            command_buffer->binding_count, binding_table.count);
   }
 
   // Validate the binding table against the commands consuming them.
@@ -622,7 +624,7 @@
   // requested on the command buffer.
   IF_VALIDATING(command_buffer, {
     IREE_RETURN_IF_ERROR(iree_hal_command_buffer_binding_table_validation(
-        command_buffer, VALIDATION_STATE(command_buffer), *binding_table));
+        command_buffer, VALIDATION_STATE(command_buffer), binding_table));
   });
 
   return iree_ok_status();
@@ -658,7 +660,16 @@
                                        transfer_command->fill.target_offset,
                                        transfer_command->fill.length),
               transfer_command->fill.pattern,
-              transfer_command->fill.pattern_length);
+              transfer_command->fill.pattern_length, IREE_HAL_FILL_FLAG_NONE);
+          break;
+        case IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE:
+          status = iree_hal_command_buffer_update_buffer(
+              command_buffer, transfer_command->update.source_buffer,
+              transfer_command->update.source_offset,
+              iree_hal_make_buffer_ref(transfer_command->update.target_buffer,
+                                       transfer_command->update.target_offset,
+                                       transfer_command->update.length),
+              IREE_HAL_UPDATE_FLAG_NONE);
           break;
         case IREE_HAL_TRANSFER_COMMAND_TYPE_COPY:
           status = iree_hal_command_buffer_copy_buffer(
@@ -668,15 +679,8 @@
                                        transfer_command->copy.length),
               iree_hal_make_buffer_ref(transfer_command->copy.target_buffer,
                                        transfer_command->copy.target_offset,
-                                       transfer_command->copy.length));
-          break;
-        case IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE:
-          status = iree_hal_command_buffer_update_buffer(
-              command_buffer, transfer_command->update.source_buffer,
-              transfer_command->update.source_offset,
-              iree_hal_make_buffer_ref(transfer_command->update.target_buffer,
-                                       transfer_command->update.target_offset,
-                                       transfer_command->update.length));
+                                       transfer_command->copy.length),
+              IREE_HAL_COPY_FLAG_NONE);
           break;
         default:
           status =
diff --git a/runtime/src/iree/hal/command_buffer.h b/runtime/src/iree/hal/command_buffer.h
index 7d2cded..f15f74a 100644
--- a/runtime/src/iree/hal/command_buffer.h
+++ b/runtime/src/iree/hal/command_buffer.h
@@ -214,6 +214,34 @@
   iree_hal_buffer_ref_t buffer_ref;
 } iree_hal_buffer_barrier_t;
 
+// Bitfield indicating advice for implementations managing a buffer.
+typedef uint64_t iree_hal_memory_advise_flags_t;
+enum iree_hal_memory_advise_flag_bits_t {
+  IREE_HAL_MEMORY_ADVISE_FLAG_NONE = 0,
+  // TODO(benvanik): cache control operations (invalidate/flush). arg0/arg1
+  // could source/target queue affinities.
+  // TODO(benvanik): prefetch and access type hints.
+  // TODO(benvanik): ASAN hints (protect/unprotect).
+};
+
+// Bitfield specifying flags controlling a fill operation.
+typedef uint64_t iree_hal_fill_flags_t;
+enum iree_hal_fill_flag_bits_t {
+  IREE_HAL_FILL_FLAG_NONE = 0,
+};
+
+// Bitfield specifying flags controlling an update operation.
+typedef uint64_t iree_hal_update_flags_t;
+enum iree_hal_update_flag_bits_t {
+  IREE_HAL_UPDATE_FLAG_NONE = 0,
+};
+
+// Bitfield specifying flags controlling a copy operation.
+typedef uint64_t iree_hal_copy_flags_t;
+enum iree_hal_copy_flag_bits_t {
+  IREE_HAL_COPY_FLAG_NONE = 0,
+};
+
 // Specifies the type of collective operation.
 enum iree_hal_collective_kind_e {
   // Gathers N*|element_count| elements of the specified type in |recv_binding|
@@ -391,10 +419,10 @@
     iree_hal_collective_element_type_t element_type);
 
 // Bitfield specifying flags controlling a dispatch operation.
+typedef uint64_t iree_hal_dispatch_flags_t;
 enum iree_hal_dispatch_flag_bits_t {
   IREE_HAL_DISPATCH_FLAG_NONE = 0,
 };
-typedef uint64_t iree_hal_dispatch_flags_t;
 
 // An RGBA color.
 typedef struct iree_hal_label_color_t {
@@ -669,13 +697,12 @@
     iree_host_size_t buffer_barrier_count,
     const iree_hal_buffer_barrier_t* buffer_barriers);
 
-// Hints to the device queue that the given buffer will not be used again.
-// After encoding a discard the buffer contents will be considered undefined.
-// This is because the discard may be used to elide write backs to host memory
-// or aggressively reuse the allocation for other purposes.
-IREE_API_EXPORT iree_status_t iree_hal_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t* command_buffer,
-    iree_hal_buffer_ref_t buffer_ref);
+// Advises the device about the usage of the given buffer.
+// The device may use this information to perform cache management or ignore it
+// entirely.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_advise_buffer(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_ref_t buffer_ref,
+    iree_hal_memory_advise_flags_t flags, uint64_t arg0, uint64_t arg1);
 
 // Fills the target buffer with the given repeating value.
 // Expects that |pattern_length| is one of 1, 2, or 4 and that the offset and
@@ -684,7 +711,8 @@
 // device queue and be allocated with IREE_HAL_BUFFER_USAGE_TRANSFER.
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_ref_t target_ref,
-    const void* pattern, iree_host_size_t pattern_length);
+    const void* pattern, iree_host_size_t pattern_length,
+    iree_hal_fill_flags_t flags);
 
 // Updates a range of the given target buffer from the source host memory.
 // The source host memory is copied immediately into the command buffer and
@@ -697,7 +725,8 @@
 // device queue and be allocated with IREE_HAL_BUFFER_USAGE_TRANSFER.
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_update_buffer(
     iree_hal_command_buffer_t* command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref);
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref,
+    iree_hal_update_flags_t flags);
 
 // Copies a range of one buffer to another.
 // Both buffers must be compatible with the devices owned by this device
@@ -709,7 +738,7 @@
 // copies.
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_ref_t source_ref,
-    iree_hal_buffer_ref_t target_ref);
+    iree_hal_buffer_ref_t target_ref, iree_hal_copy_flags_t flags);
 
 // Dispatches a collective operation defined by |op| using the given buffers.
 // |param| must be specified for operations that require a root/peer rank
@@ -763,7 +792,7 @@
 // are used by the command buffer are provided they will be ignored.
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_validate_submission(
     iree_hal_command_buffer_t* command_buffer,
-    const iree_hal_buffer_binding_table_t* binding_table);
+    iree_hal_buffer_binding_table_t binding_table);
 
 //===----------------------------------------------------------------------===//
 // Utilities for command buffer creation
@@ -773,10 +802,10 @@
 typedef enum iree_hal_transfer_command_type_t {
   // iree_hal_command_buffer_fill_buffer
   IREE_HAL_TRANSFER_COMMAND_TYPE_FILL = 0u,
-  // iree_hal_command_buffer_copy_buffer
-  IREE_HAL_TRANSFER_COMMAND_TYPE_COPY = 1u,
   // iree_hal_command_buffer_update_buffer
-  IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE = 2u,
+  IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE = 1u,
+  // iree_hal_command_buffer_copy_buffer
+  IREE_HAL_TRANSFER_COMMAND_TYPE_COPY = 2u,
 } iree_hal_transfer_command_type_t;
 
 // Represents a single transfer command within a batch of commands.
@@ -792,14 +821,6 @@
       const void* pattern;
       iree_host_size_t pattern_length;
     } fill;
-    // IREE_HAL_TRANSFER_COMMAND_TYPE_COPY
-    struct {
-      iree_hal_buffer_t* source_buffer;
-      iree_device_size_t source_offset;
-      iree_hal_buffer_t* target_buffer;
-      iree_device_size_t target_offset;
-      iree_device_size_t length;
-    } copy;
     // IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE
     struct {
       const void* source_buffer;
@@ -808,6 +829,14 @@
       iree_device_size_t target_offset;
       iree_device_size_t length;
     } update;
+    // IREE_HAL_TRANSFER_COMMAND_TYPE_COPY
+    struct {
+      iree_hal_buffer_t* source_buffer;
+      iree_device_size_t source_offset;
+      iree_hal_buffer_t* target_buffer;
+      iree_device_size_t target_offset;
+      iree_device_size_t length;
+    } copy;
   };
 } iree_hal_transfer_command_t;
 
@@ -872,22 +901,25 @@
       iree_host_size_t buffer_barrier_count,
       const iree_hal_buffer_barrier_t* buffer_barriers);
 
-  iree_status_t(IREE_API_PTR* discard_buffer)(
+  iree_status_t(IREE_API_PTR* advise_buffer)(
       iree_hal_command_buffer_t* command_buffer,
-      iree_hal_buffer_ref_t buffer_ref);
+      iree_hal_buffer_ref_t buffer_ref, iree_hal_memory_advise_flags_t flags,
+      uint64_t arg0, uint64_t arg1);
 
   iree_status_t(IREE_API_PTR* fill_buffer)(
       iree_hal_command_buffer_t* command_buffer,
       iree_hal_buffer_ref_t target_ref, const void* pattern,
-      iree_host_size_t pattern_length);
+      iree_host_size_t pattern_length, iree_hal_fill_flags_t flags);
 
   iree_status_t(IREE_API_PTR* update_buffer)(
       iree_hal_command_buffer_t* command_buffer, const void* source_buffer,
-      iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref);
+      iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref,
+      iree_hal_update_flags_t flags);
 
   iree_status_t(IREE_API_PTR* copy_buffer)(
       iree_hal_command_buffer_t* command_buffer,
-      iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref);
+      iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref,
+      iree_hal_copy_flags_t flags);
 
   iree_status_t(IREE_API_PTR* collective)(
       iree_hal_command_buffer_t* command_buffer, iree_hal_channel_t* channel,
diff --git a/runtime/src/iree/hal/command_buffer_validation.c b/runtime/src/iree/hal/command_buffer_validation.c
index 832e652..2d82932 100644
--- a/runtime/src/iree/hal/command_buffer_validation.c
+++ b/runtime/src/iree/hal/command_buffer_validation.c
@@ -331,10 +331,11 @@
   return iree_ok_status();
 }
 
-iree_status_t iree_hal_command_buffer_discard_buffer_validation(
+iree_status_t iree_hal_command_buffer_advise_buffer_validation(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
-    iree_hal_buffer_ref_t buffer_ref) {
+    iree_hal_buffer_ref_t buffer_ref, iree_hal_memory_advise_flags_t flags,
+    uint64_t arg0, uint64_t arg1) {
   IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
       command_buffer, validation_state, IREE_HAL_COMMAND_CATEGORY_TRANSFER));
 
@@ -352,7 +353,7 @@
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
     iree_hal_buffer_ref_t target_ref, const void* pattern,
-    iree_host_size_t pattern_length) {
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
   IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
       command_buffer, validation_state, IREE_HAL_COMMAND_CATEGORY_TRANSFER));
 
@@ -392,7 +393,7 @@
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
     const void* source_buffer, iree_host_size_t source_offset,
-    iree_hal_buffer_ref_t target_ref) {
+    iree_hal_buffer_ref_t target_ref, iree_hal_update_flags_t flags) {
   IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
       command_buffer, validation_state, IREE_HAL_COMMAND_CATEGORY_TRANSFER));
 
@@ -412,7 +413,8 @@
 iree_status_t iree_hal_command_buffer_copy_buffer_validation(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
-    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref,
+    iree_hal_copy_flags_t flags) {
   IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
       command_buffer, validation_state, IREE_HAL_COMMAND_CATEGORY_TRANSFER));
 
diff --git a/runtime/src/iree/hal/command_buffer_validation.h b/runtime/src/iree/hal/command_buffer_validation.h
index dee7bb4..5bae019 100644
--- a/runtime/src/iree/hal/command_buffer_validation.h
+++ b/runtime/src/iree/hal/command_buffer_validation.h
@@ -99,27 +99,29 @@
     iree_host_size_t buffer_barrier_count,
     const iree_hal_buffer_barrier_t* buffer_barriers);
 
-iree_status_t iree_hal_command_buffer_discard_buffer_validation(
+iree_status_t iree_hal_command_buffer_advise_buffer_validation(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
-    iree_hal_buffer_ref_t buffer_ref);
+    iree_hal_buffer_ref_t buffer_ref, iree_hal_memory_advise_flags_t flags,
+    uint64_t arg0, uint64_t arg1);
 
 iree_status_t iree_hal_command_buffer_fill_buffer_validation(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
     iree_hal_buffer_ref_t target_ref, const void* pattern,
-    iree_host_size_t pattern_length);
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags);
 
 iree_status_t iree_hal_command_buffer_update_buffer_validation(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
     const void* source_buffer, iree_host_size_t source_offset,
-    iree_hal_buffer_ref_t target_ref);
+    iree_hal_buffer_ref_t target_ref, iree_hal_update_flags_t flags);
 
 iree_status_t iree_hal_command_buffer_copy_buffer_validation(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
-    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref);
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref,
+    iree_hal_copy_flags_t flags);
 
 iree_status_t iree_hal_command_buffer_collective_validation(
     iree_hal_command_buffer_t* command_buffer,
diff --git a/runtime/src/iree/hal/cts/command_buffer_copy_buffer_test.h b/runtime/src/iree/hal/cts/command_buffer_copy_buffer_test.h
index 70735c0..2cb1fef 100644
--- a/runtime/src/iree/hal/cts/command_buffer_copy_buffer_test.h
+++ b/runtime/src/iree/hal/cts/command_buffer_copy_buffer_test.h
@@ -69,7 +69,8 @@
       command_buffer, /*source_ref=*/
       iree_hal_make_buffer_ref(host_buffer, 0, kDefaultAllocationSize),
       /*target_ref=*/
-      iree_hal_make_buffer_ref(device_buffer, 0, kDefaultAllocationSize)));
+      iree_hal_make_buffer_ref(device_buffer, 0, kDefaultAllocationSize),
+      IREE_HAL_COPY_FLAG_NONE));
   IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
 
   IREE_ASSERT_OK(SubmitCommandBufferAndWait(command_buffer));
@@ -136,7 +137,7 @@
       command_buffer,
       iree_hal_make_buffer_ref(device_buffer, /*target_offset=*/0,
                                /*length=*/8),
-      &zero_val, /*pattern_length=*/sizeof(zero_val)));
+      &zero_val, /*pattern_length=*/sizeof(zero_val), IREE_HAL_FILL_FLAG_NONE));
   IREE_ASSERT_OK(iree_hal_command_buffer_copy_buffer(
       command_buffer,
       iree_hal_make_buffer_ref(/*source_buffer=*/host_buffer,
@@ -144,7 +145,8 @@
                                /*length=*/kDefaultAllocationSize / 2 - 4),
       iree_hal_make_buffer_ref(/*target_buffer=*/device_buffer,
                                /*target_offset=*/8,
-                               /*length=*/kDefaultAllocationSize / 2 - 4)));
+                               /*length=*/kDefaultAllocationSize / 2 - 4),
+      IREE_HAL_COPY_FLAG_NONE));
   IREE_ASSERT_OK(iree_hal_command_buffer_fill_buffer(
       command_buffer,
       iree_hal_make_buffer_ref(
@@ -153,7 +155,7 @@
           /*length=*/kDefaultAllocationSize -
               (8 + kDefaultAllocationSize / 2 - 4)),
       &zero_val,
-      /*pattern_length=*/sizeof(zero_val)));
+      /*pattern_length=*/sizeof(zero_val), IREE_HAL_FILL_FLAG_NONE));
   IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
 
   IREE_ASSERT_OK(SubmitCommandBufferAndWait(command_buffer));
@@ -223,7 +225,7 @@
       command_buffer,
       iree_hal_make_indirect_buffer_ref(kDeviceBufferSlot, /*offset=*/0,
                                         /*length=*/8),
-      &zero_val, /*pattern_length=*/sizeof(zero_val)));
+      &zero_val, /*pattern_length=*/sizeof(zero_val), IREE_HAL_FILL_FLAG_NONE));
   IREE_ASSERT_OK(iree_hal_command_buffer_copy_buffer(
       command_buffer,
       iree_hal_make_indirect_buffer_ref(
@@ -233,7 +235,8 @@
       iree_hal_make_indirect_buffer_ref(
           kDeviceBufferSlot,
           /*offset=*/8,
-          /*length=*/kDefaultAllocationSize / 2 - 4)));
+          /*length=*/kDefaultAllocationSize / 2 - 4),
+      IREE_HAL_COPY_FLAG_NONE));
   IREE_ASSERT_OK(iree_hal_command_buffer_fill_buffer(
       command_buffer,
       iree_hal_make_indirect_buffer_ref(
@@ -242,7 +245,7 @@
           /*length=*/kDefaultAllocationSize -
               (8 + kDefaultAllocationSize / 2 - 4)),
       &zero_val,
-      /*pattern_length=*/sizeof(zero_val)));
+      /*pattern_length=*/sizeof(zero_val), IREE_HAL_FILL_FLAG_NONE));
   IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
 
   const iree_hal_buffer_binding_t bindings[] = {
diff --git a/runtime/src/iree/hal/cts/command_buffer_fill_buffer_test.h b/runtime/src/iree/hal/cts/command_buffer_fill_buffer_test.h
index d5dc0a7..d28040c 100644
--- a/runtime/src/iree/hal/cts/command_buffer_fill_buffer_test.h
+++ b/runtime/src/iree/hal/cts/command_buffer_fill_buffer_test.h
@@ -58,7 +58,7 @@
     IREE_CHECK_OK(iree_hal_command_buffer_fill_buffer(
         command_buffer,
         iree_hal_make_buffer_ref(device_buffer, target_offset, fill_length),
-        pattern, pattern_length));
+        pattern, pattern_length, IREE_HAL_FILL_FLAG_NONE));
     IREE_CHECK_OK(iree_hal_command_buffer_end(command_buffer));
     IREE_CHECK_OK(SubmitCommandBufferAndWait(command_buffer));
 
@@ -94,7 +94,7 @@
     IREE_CHECK_OK(iree_hal_command_buffer_fill_buffer(
         command_buffer,
         iree_hal_make_buffer_ref(device_buffer, target_offset, fill_length),
-        pattern, pattern_length));
+        pattern, pattern_length, IREE_HAL_FILL_FLAG_NONE));
     IREE_CHECK_OK(iree_hal_command_buffer_end(command_buffer));
     IREE_CHECK_OK(SubmitCommandBufferAndWait(command_buffer));
 
diff --git a/runtime/src/iree/hal/cts/command_buffer_update_buffer_test.h b/runtime/src/iree/hal/cts/command_buffer_update_buffer_test.h
index 55f9b64..0ba96e4 100644
--- a/runtime/src/iree/hal/cts/command_buffer_update_buffer_test.h
+++ b/runtime/src/iree/hal/cts/command_buffer_update_buffer_test.h
@@ -43,7 +43,8 @@
   IREE_CHECK_OK(iree_hal_command_buffer_update_buffer(
       command_buffer,
       /*source_buffer=*/source_buffer.data(), /*source_offset=*/0,
-      iree_hal_make_buffer_ref(device_buffer, 0, target_buffer_size)));
+      iree_hal_make_buffer_ref(device_buffer, 0, target_buffer_size),
+      IREE_HAL_UPDATE_FLAG_NONE));
   IREE_CHECK_OK(iree_hal_command_buffer_end(command_buffer));
   IREE_CHECK_OK(SubmitCommandBufferAndWait(command_buffer));
 
@@ -81,7 +82,8 @@
       command_buffer,
       /*source_buffer=*/source_buffer.data(), /*source_offset=*/4,
       iree_hal_make_buffer_ref(device_buffer,
-                               /*target_offset=*/4, /*length=*/8)));
+                               /*target_offset=*/4, /*length=*/8),
+      IREE_HAL_UPDATE_FLAG_NONE));
   IREE_CHECK_OK(iree_hal_command_buffer_end(command_buffer));
   IREE_CHECK_OK(SubmitCommandBufferAndWait(command_buffer));
 
@@ -129,7 +131,8 @@
       command_buffer,
       /*source_buffer=*/source_buffer.data(), /*source_offset=*/4,
       iree_hal_make_buffer_ref(buffer_subspan,
-                               /*target_offset=*/4, /*length=*/4)));
+                               /*target_offset=*/4, /*length=*/4),
+      IREE_HAL_UPDATE_FLAG_NONE));
   IREE_CHECK_OK(iree_hal_command_buffer_end(command_buffer));
   IREE_CHECK_OK(SubmitCommandBufferAndWait(command_buffer));
 
diff --git a/runtime/src/iree/hal/cts/cts_test_base.h b/runtime/src/iree/hal/cts/cts_test_base.h
index 8799817..c388e37 100644
--- a/runtime/src/iree/hal/cts/cts_test_base.h
+++ b/runtime/src/iree/hal/cts/cts_test_base.h
@@ -219,15 +219,6 @@
       iree_hal_command_buffer_t* command_buffer,
       iree_hal_buffer_binding_table_t binding_table =
           iree_hal_buffer_binding_table_empty()) {
-    return SubmitCommandBuffersAndWait(1, &command_buffer, &binding_table);
-  }
-
-  // Submits |command_buffers| to the device and waits for them to complete
-  // before returning.
-  iree_status_t SubmitCommandBuffersAndWait(
-      iree_host_size_t command_buffer_count,
-      iree_hal_command_buffer_t** command_buffers,
-      const iree_hal_buffer_binding_table_t* binding_tables = nullptr) {
     // No wait semaphores.
     iree_hal_semaphore_list_t wait_semaphores = iree_hal_semaphore_list_empty();
 
@@ -244,8 +235,7 @@
 
     iree_status_t status = iree_hal_device_queue_execute(
         device_, IREE_HAL_QUEUE_AFFINITY_ANY, wait_semaphores,
-        signal_semaphores, command_buffer_count, command_buffers,
-        binding_tables);
+        signal_semaphores, command_buffer, binding_table);
     if (iree_status_is_ok(status)) {
       status = iree_hal_semaphore_wait(signal_semaphore, target_payload_value,
                                        iree_infinite_timeout());
diff --git a/runtime/src/iree/hal/cts/event_test.h b/runtime/src/iree/hal/cts/event_test.h
index 01d56e6..c936f59 100644
--- a/runtime/src/iree/hal/cts/event_test.h
+++ b/runtime/src/iree/hal/cts/event_test.h
@@ -85,12 +85,8 @@
       /*buffer_barriers=*/NULL));
   IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer_2));
 
-  iree_hal_command_buffer_t* command_buffer_ptrs[] = {
-      command_buffer_1,
-      command_buffer_2,
-  };
-  IREE_ASSERT_OK(SubmitCommandBuffersAndWait(
-      IREE_ARRAYSIZE(command_buffer_ptrs), command_buffer_ptrs));
+  IREE_ASSERT_OK(SubmitCommandBufferAndWait(command_buffer_1));
+  IREE_ASSERT_OK(SubmitCommandBufferAndWait(command_buffer_2));
 
   iree_hal_command_buffer_release(command_buffer_1);
   iree_hal_command_buffer_release(command_buffer_2);
diff --git a/runtime/src/iree/hal/cts/file_test.h b/runtime/src/iree/hal/cts/file_test.h
index 5900977..9c8c8f9 100644
--- a/runtime/src/iree/hal/cts/file_test.h
+++ b/runtime/src/iree/hal/cts/file_test.h
@@ -111,7 +111,7 @@
       iree_hal_fence_semaphore_list(wait_fence),
       iree_hal_fence_semaphore_list(signal_fence), /*source_file=*/file,
       /*source_offset=*/0, /*target_buffer=*/buffer, /*target_offset=*/0,
-      /*length=*/file_size, /*flags=*/0));
+      /*length=*/file_size, IREE_HAL_READ_FLAG_NONE));
 
   IREE_ASSERT_OK(iree_hal_fence_wait(signal_fence, iree_infinite_timeout()));
   iree_hal_fence_release(wait_fence);
diff --git a/runtime/src/iree/hal/cts/semaphore_submission_test.h b/runtime/src/iree/hal/cts/semaphore_submission_test.h
index b745761..0943681 100644
--- a/runtime/src/iree/hal/cts/semaphore_submission_test.h
+++ b/runtime/src/iree/hal/cts/semaphore_submission_test.h
@@ -56,7 +56,7 @@
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_,
       /*queue_affinity=*/0, iree_hal_semaphore_list_empty(), signal_semaphores,
-      1, &command_buffer, /*binding_tables=*/NULL));
+      command_buffer, iree_hal_buffer_binding_table_empty()));
   IREE_ASSERT_OK(
       iree_hal_semaphore_wait(signal_semaphore, 1, iree_infinite_timeout()));
 
@@ -88,8 +88,8 @@
 
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_,
-      /*queue_affinity=*/0, wait_semaphores, signal_semaphores, 1,
-      &command_buffer, /*binding_tables=*/NULL));
+      /*queue_affinity=*/0, wait_semaphores, signal_semaphores, command_buffer,
+      iree_hal_buffer_binding_table_empty()));
 
   // Work shouldn't start until the wait semaphore reaches its payload value.
   CheckSemaphoreValue(signal_semaphore, 100);
@@ -131,8 +131,8 @@
 
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_,
-      /*queue_affinity=*/0, wait_semaphores, signal_semaphores, 1,
-      &command_buffer, /*binding_tables=*/NULL));
+      /*queue_affinity=*/0, wait_semaphores, signal_semaphores, command_buffer,
+      iree_hal_buffer_binding_table_empty()));
 
   // Work shouldn't start until all wait semaphores reach their payload values.
   CheckSemaphoreValue(signal_semaphore_1, 0);
@@ -177,7 +177,8 @@
   // Dispatch the device command buffer to have it wait.
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY, device_wait_semaphores,
-      device_signal_semaphores, 1, &command_buffer, /*binding_tables=*/NULL));
+      device_signal_semaphores, command_buffer,
+      iree_hal_buffer_binding_table_empty()));
 
   // Start another thread and have it wait.
   std::thread thread([&]() {
@@ -241,7 +242,8 @@
   // Dispatch the device command buffer to have it wait.
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY, device_wait_semaphores,
-      device_signal_semaphores, 1, &command_buffer, /*binding_tables=*/NULL));
+      device_signal_semaphores, command_buffer,
+      iree_hal_buffer_binding_table_empty()));
 
   // Start another thread and have it wait.
   std::thread thread([&]() {
@@ -309,7 +311,8 @@
   // Dispatch the device command buffer to have it wait.
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY, device_wait_semaphores,
-      device_signal_semaphores, 1, &command_buffer, /*binding_tables=*/NULL));
+      device_signal_semaphores, command_buffer,
+      iree_hal_buffer_binding_table_empty()));
 
   // Start another thread and have it wait.
   std::thread thread([&]() {
@@ -380,8 +383,8 @@
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/semaphore1_list,
-      /*signal_semaphore_list=*/semaphore2_list, 1, &command_buffer2,
-      /*binding_tables=*/NULL));
+      /*signal_semaphore_list=*/semaphore2_list, command_buffer2,
+      iree_hal_buffer_binding_table_empty()));
 
   // Make sure that the intermediate and second semaphores have not advanced
   // since only command_buffer2 is queued.
@@ -394,8 +397,8 @@
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/command_buffer1_wait_semaphore_list,
-      /*signal_semaphore_list=*/semaphore1_list, 1, &command_buffer1,
-      /*binding_tables=*/NULL));
+      /*signal_semaphore_list=*/semaphore1_list, command_buffer1,
+      iree_hal_buffer_binding_table_empty()));
 
   // Wait on the intermediate semaphore and check its value.
   IREE_ASSERT_OK(
@@ -449,18 +452,18 @@
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/semaphore11_list,
-      /*signal_semaphore_list=*/semaphore22_list, 1, &command_buffer22,
-      /*binding_tables=*/NULL));
+      /*signal_semaphore_list=*/semaphore22_list, command_buffer22,
+      iree_hal_buffer_binding_table_empty()));
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/semaphore11_list,
-      /*signal_semaphore_list=*/semaphore21_list, 1, &command_buffer21,
-      /*binding_tables=*/NULL));
+      /*signal_semaphore_list=*/semaphore21_list, command_buffer21,
+      iree_hal_buffer_binding_table_empty()));
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/empty_semaphore_list,
-      /*signal_semaphore_list=*/empty_semaphore_list, 1, &command_buffer12,
-      /*binding_tables=*/NULL));
+      /*signal_semaphore_list=*/empty_semaphore_list, command_buffer12,
+      iree_hal_buffer_binding_table_empty()));
 
   // Assert that semaphores have not advance since we have not yet submitted
   // command_buffer11.
@@ -472,8 +475,8 @@
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/empty_semaphore_list,
-      /*signal_semaphore_list=*/semaphore11_list, 1, &command_buffer11,
-      /*binding_tables=*/NULL));
+      /*signal_semaphore_list=*/semaphore11_list, command_buffer11,
+      iree_hal_buffer_binding_table_empty()));
 
   // Wait and check that semaphore values have advanced.
   IREE_ASSERT_OK(
@@ -544,14 +547,14 @@
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/command_buffer22_semaphore_wait_list,
-      /*signal_semaphore_list=*/command_buffer22_signal_list, 1,
-      &command_buffer22, /*binding_tables=*/NULL));
+      /*signal_semaphore_list=*/command_buffer22_signal_list, command_buffer22,
+      iree_hal_buffer_binding_table_empty()));
   // We submit the command buffers in reverse order.
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/command_buffer21_semaphore_wait_list,
-      /*signal_semaphore_list=*/command_buffer21_signal_list, 1,
-      &command_buffer21, /*binding_tables=*/NULL));
+      /*signal_semaphore_list=*/command_buffer21_signal_list, command_buffer21,
+      iree_hal_buffer_binding_table_empty()));
 
   // Semaphores have not advance since we have not yet submitted
   // command_buffer11.
@@ -562,8 +565,8 @@
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/command_buffer11_semaphore_wait_list,
-      /*signal_semaphore_list=*/command_buffer11_semaphore_signal_list, 1,
-      &command_buffer11, /*binding_tables=*/NULL));
+      /*signal_semaphore_list=*/command_buffer11_semaphore_signal_list,
+      command_buffer11, iree_hal_buffer_binding_table_empty()));
 
   // Wait and check that semaphore values have advanced.
   IREE_ASSERT_OK(
@@ -620,8 +623,8 @@
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/command_buffer2_wait_list,
-      /*signal_semaphore_list=*/command_buffer2_signal_list, 1,
-      &command_buffer2, /*binding_tables=*/NULL));
+      /*signal_semaphore_list=*/command_buffer2_signal_list, command_buffer2,
+      iree_hal_buffer_binding_table_empty()));
 
   // semaphore3 must not have advanced, because it depends on semaphore1 and
   // semaphore2, which have not been signaled yet.
@@ -635,8 +638,8 @@
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/command_buffer1_wait_list,
-      /*signal_semaphore_list=*/command_buffer1_signal_list, 1,
-      &command_buffer1, /*binding_tables=*/NULL));
+      /*signal_semaphore_list=*/command_buffer1_signal_list, command_buffer1,
+      iree_hal_buffer_binding_table_empty()));
 
   // semaphore3 must not have advanced still, because it depends on semaphore2,
   // which has not been signaled yet.
@@ -692,8 +695,8 @@
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/command_buffer2_wait_list,
-      /*signal_semaphore_list=*/command_buffer2_signal_list, 1,
-      &command_buffer2, /*binding_tables=*/NULL));
+      /*signal_semaphore_list=*/command_buffer2_signal_list, command_buffer2,
+      iree_hal_buffer_binding_table_empty()));
 
   // Semaphores have not advance since we have not yet submitted
   // command_buffer1.
@@ -730,8 +733,8 @@
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/command_buffer1_wait_list,
-      /*signal_semaphore_list=*/command_buffer1_signal_list, 1,
-      &command_buffer1, /*binding_tables=*/NULL));
+      /*signal_semaphore_list=*/command_buffer1_signal_list, command_buffer1,
+      iree_hal_buffer_binding_table_empty()));
 
   thread11.join();
   thread12.join();
@@ -780,8 +783,8 @@
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/command_buffer_wait_list,
-      /*signal_semaphore_list=*/command_buffer_signal_list, 1, &command_buffer,
-      /*binding_tables=*/NULL));
+      /*signal_semaphore_list=*/command_buffer_signal_list, command_buffer,
+      iree_hal_buffer_binding_table_empty()));
 
   IREE_ASSERT_OK(
       iree_hal_semaphore_wait(semaphore2, semaphore2_signal_value,
@@ -822,8 +825,8 @@
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/command_buffer_wait_list,
-      /*signal_semaphore_list=*/command_buffer_signal_list, 1, &command_buffer,
-      /*binding_tables=*/NULL));
+      /*signal_semaphore_list=*/command_buffer_signal_list, command_buffer,
+      iree_hal_buffer_binding_table_empty()));
 
   std::thread signal_thread(
       [&]() { IREE_ASSERT_OK(iree_hal_semaphore_signal(semaphore1, 2)); });
@@ -867,8 +870,8 @@
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       device_, IREE_HAL_QUEUE_AFFINITY_ANY,
       /*wait_semaphore_list=*/command_buffer_wait_list,
-      /*signal_semaphore_list=*/command_buffer_signal_list, 1, &command_buffer,
-      /*binding_tables=*/NULL));
+      /*signal_semaphore_list=*/command_buffer_signal_list, command_buffer,
+      iree_hal_buffer_binding_table_empty()));
 
   iree_status_t status =
       iree_make_status(IREE_STATUS_CANCELLED, "PropagateFailSignal test.");
diff --git a/runtime/src/iree/hal/device.c b/runtime/src/iree/hal/device.c
index 7ae9abb..f2b7d78 100644
--- a/runtime/src/iree/hal/device.c
+++ b/runtime/src/iree/hal/device.c
@@ -122,13 +122,13 @@
   return status;
 }
 
-IREE_API_EXPORT iree_status_t iree_hal_device_queue_fill(
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_fill(
     iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
     iree_device_size_t length, const void* pattern,
-    iree_host_size_t pattern_length) {
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
   IREE_ASSERT_ARGUMENT(device);
   IREE_ASSERT_ARGUMENT(target_buffer);
   IREE_ASSERT_ARGUMENT(pattern);
@@ -162,8 +162,8 @@
                                                   &command_buffer));
 
   iree_status_t status = iree_hal_device_queue_execute(
-      device, queue_affinity, wait_semaphore_list, signal_semaphore_list, 1,
-      &command_buffer, /*binding_tables=*/NULL);
+      device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+      command_buffer, iree_hal_buffer_binding_table_empty());
 
   iree_hal_command_buffer_release(command_buffer);
 
@@ -171,13 +171,123 @@
   return status;
 }
 
-IREE_API_EXPORT iree_status_t iree_hal_device_queue_copy(
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_fill(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, const void* pattern,
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
+  IREE_ASSERT_ARGUMENT(device);  // argument checks precede the trace zone
+  IREE_ASSERT_ARGUMENT(  // empty list, or both arrays non-NULL
+      !wait_semaphore_list.count ||
+      (wait_semaphore_list.semaphores && wait_semaphore_list.payload_values));
+  IREE_ASSERT_ARGUMENT(!signal_semaphore_list.count ||  // same for signals
+                       (signal_semaphore_list.semaphores &&
+                        signal_semaphore_list.payload_values));
+  IREE_ASSERT_ARGUMENT(pattern);
+  IREE_ASSERT_ARGUMENT(target_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)length);  // tag with length
+  iree_status_t status = _VTABLE_DISPATCH(device, queue_fill)(
+      device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+      target_buffer, target_offset, length, pattern, pattern_length, flags);
+  IREE_TRACE_ZONE_END(z0);  // no return since BEGIN; z0 always closes
+  return status;  // the _VTABLE_DISPATCH(queue_fill) result, unmodified
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_update(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    const void* source_buffer, iree_host_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, iree_hal_update_flags_t flags) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(source_buffer);
+  IREE_ASSERT_ARGUMENT(target_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)length);
+
+  // If we are starting execution immediately then we can reduce latency by
+  // allowing inline command buffer execution.
+  iree_hal_command_buffer_mode_t command_buffer_mode =
+      IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT;
+  if (wait_semaphore_list.count == 0) {
+    command_buffer_mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
+  }
+
+  // TODO(benvanik): support splitting the update into multiple chunks to fit
+  // under the max command buffer update size limit. This provisional API is
+  // intended only for updating dispatch parameters today.
+  if (length > UINT16_MAX) {
+    IREE_TRACE_ZONE_END(z0);  // z0 is open; close it on this early exit.
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "queue buffer updates currently limited to 64KB, "
+                            "tried to update %" PRIdsz " bytes",
+                            length);  // iree_device_size_t pairs with PRIdsz
+  }
+
+  iree_hal_transfer_command_t command = {
+      .type = IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE,
+      .update =
+          {
+              .source_buffer = source_buffer,
+              .source_offset = source_offset,
+              .target_buffer = target_buffer,
+              .target_offset = target_offset,
+              .length = length,
+          },
+  };
+
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_create_transfer_command_buffer(device, command_buffer_mode,
+                                                  queue_affinity, 1, &command,
+                                                  &command_buffer));
+
+  iree_status_t status = iree_hal_device_queue_execute(
+      device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+      command_buffer, iree_hal_buffer_binding_table_empty());
+  iree_hal_command_buffer_release(command_buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_update(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    const void* source_buffer, iree_host_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, iree_hal_update_flags_t flags) {
+  IREE_ASSERT_ARGUMENT(device);  // argument checks precede the trace zone
+  IREE_ASSERT_ARGUMENT(  // empty list, or both arrays non-NULL
+      !wait_semaphore_list.count ||
+      (wait_semaphore_list.semaphores && wait_semaphore_list.payload_values));
+  IREE_ASSERT_ARGUMENT(!signal_semaphore_list.count ||  // same for signals
+                       (signal_semaphore_list.semaphores &&
+                        signal_semaphore_list.payload_values));
+  IREE_ASSERT_ARGUMENT(source_buffer);  // a const void*, not a HAL buffer
+  IREE_ASSERT_ARGUMENT(target_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)length);  // tag with length
+  iree_status_t status = _VTABLE_DISPATCH(device, queue_update)(
+      device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+      source_buffer, source_offset, target_buffer, target_offset, length,
+      flags);
+  IREE_TRACE_ZONE_END(z0);  // no return since BEGIN; z0 always closes
+  return status;  // the _VTABLE_DISPATCH(queue_update) result, unmodified
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_copy(
     iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
     iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length) {
+    iree_device_size_t length, iree_hal_copy_flags_t flags) {
   IREE_ASSERT_ARGUMENT(device);
   IREE_ASSERT_ARGUMENT(source_buffer);
   IREE_ASSERT_ARGUMENT(target_buffer);
@@ -211,8 +321,8 @@
                                                   &command_buffer));
 
   iree_status_t status = iree_hal_device_queue_execute(
-      device, queue_affinity, wait_semaphore_list, signal_semaphore_list, 1,
-      &command_buffer, /*binding_tables=*/NULL);
+      device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+      command_buffer, iree_hal_buffer_binding_table_empty());
 
   iree_hal_command_buffer_release(command_buffer);
 
@@ -220,13 +330,39 @@
   return status;
 }
 
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_copy(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, iree_hal_copy_flags_t flags) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(
+      !wait_semaphore_list.count ||
+      (wait_semaphore_list.semaphores && wait_semaphore_list.payload_values));
+  IREE_ASSERT_ARGUMENT(!signal_semaphore_list.count ||
+                       (signal_semaphore_list.semaphores &&
+                        signal_semaphore_list.payload_values));
+  IREE_ASSERT_ARGUMENT(source_buffer);
+  IREE_ASSERT_ARGUMENT(target_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)length);
+  iree_status_t status = _VTABLE_DISPATCH(device, queue_copy)(
+      device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+      source_buffer, source_offset, target_buffer, target_offset, length,
+      flags);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
 IREE_API_EXPORT iree_status_t iree_hal_device_queue_read(
     iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_file_t* source_file, uint64_t source_offset,
     iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_read_flags_t flags) {
   IREE_ASSERT_ARGUMENT(device);
   IREE_ASSERT_ARGUMENT(
       !wait_semaphore_list.count ||
@@ -250,7 +386,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
     iree_hal_file_t* target_file, uint64_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_write_flags_t flags) {
   IREE_ASSERT_ARGUMENT(device);
   IREE_ASSERT_ARGUMENT(
       !wait_semaphore_list.count ||
@@ -272,9 +408,8 @@
     iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
-    iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers,
-    iree_hal_buffer_binding_table_t const* binding_tables) {
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_buffer_binding_table_t binding_table) {
   IREE_ASSERT_ARGUMENT(device);
   IREE_ASSERT_ARGUMENT(
       !wait_semaphore_list.count ||
@@ -282,44 +417,39 @@
   IREE_ASSERT_ARGUMENT(!signal_semaphore_list.count ||
                        (signal_semaphore_list.semaphores &&
                         signal_semaphore_list.payload_values));
-  IREE_ASSERT_ARGUMENT(!command_buffer_count || command_buffers);
   IREE_TRACE_ZONE_BEGIN(z0);
 
   // TODO(benvanik): move into devices instead? then a synchronous/inline device
   // could assert the waits are resolved instead of blanket failing on an
   // already-resolved semaphore. This would make using stream-ordered
   // allocations easier.
-  for (iree_host_size_t i = 0; i < command_buffer_count; ++i) {
-    if (wait_semaphore_list.count > 0 &&
-        iree_all_bits_set(
-            iree_hal_command_buffer_mode(command_buffers[i]),
-            IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION)) {
-      // Inline command buffers are not allowed to wait (as they could have
-      // already been executed!). This is a requirement of the API so we
-      // validate it across all backends even if they don't support inline
-      // execution and ignore it.
-      IREE_TRACE_ZONE_END(z0);
-      return iree_make_status(
-          IREE_STATUS_INVALID_ARGUMENT,
-          "inline command buffer submitted with a wait; inline command "
-          "buffers must be ready to execute immediately");
-    }
+  if (wait_semaphore_list.count > 0 && command_buffer &&
+      iree_all_bits_set(iree_hal_command_buffer_mode(command_buffer),
+                        IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION)) {
+    // Inline command buffers are not allowed to wait (as they could have
+    // already been executed!). This is a requirement of the API so we
+    // validate it across all backends even if they don't support inline
+    // execution and ignore it.
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "inline command buffer submitted with a wait; inline command "
+        "buffers must be ready to execute immediately");
   }
 
   // Validate command buffer bindings against the provided binding tables.
   // This will error out if a binding table is required but not provided or if
   // any binding in the table does not match the requirements of the command
   // buffer as recorded.
-  for (iree_host_size_t i = 0; i < command_buffer_count; ++i) {
+  if (command_buffer) {
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
-        z0,
-        iree_hal_command_buffer_validate_submission(
-            command_buffers[i], binding_tables ? &binding_tables[i] : NULL));
+        z0, iree_hal_command_buffer_validate_submission(command_buffer,
+                                                        binding_table));
   }
 
   iree_status_t status = _VTABLE_DISPATCH(device, queue_execute)(
       device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
-      command_buffer_count, command_buffers, binding_tables);
+      command_buffer, binding_table);
 
   IREE_TRACE_ZONE_END(z0);
   return status;
@@ -331,9 +461,9 @@
     const iree_hal_semaphore_list_t signal_semaphore_list) {
   IREE_ASSERT_ARGUMENT(device);
   IREE_TRACE_ZONE_BEGIN(z0);
-  iree_status_t status =
-      iree_hal_device_queue_execute(device, queue_affinity, wait_semaphore_list,
-                                    signal_semaphore_list, 0, NULL, NULL);
+  iree_status_t status = iree_hal_device_queue_execute(
+      device, queue_affinity, wait_semaphore_list, signal_semaphore_list, NULL,
+      iree_hal_buffer_binding_table_empty());
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
diff --git a/runtime/src/iree/hal/device.h b/runtime/src/iree/hal/device.h
index 82aac60..4511fa7 100644
--- a/runtime/src/iree/hal/device.h
+++ b/runtime/src/iree/hal/device.h
@@ -42,6 +42,7 @@
 // request of the calling application. Note that certain features may disable
 // runtime optimizations or require compilation flags to ensure the required
 // metadata is present in executables.
+typedef uint64_t iree_hal_device_feature_t;
 enum iree_hal_device_feature_bits_t {
   IREE_HAL_DEVICE_FEATURE_NONE = 0u,
 
@@ -67,7 +68,6 @@
   // partial embedded debug information to allow mapping back to source offsets.
   IREE_HAL_DEVICE_FEATURE_SUPPORTS_PROFILING = 1u << 2,
 };
-typedef uint32_t iree_hal_device_feature_t;
 
 // Describes an enumerated HAL device.
 typedef struct iree_hal_device_info_t {
@@ -81,6 +81,7 @@
 
 // Defines what information is captured during profiling.
 // Not all implementations will support all modes.
+typedef uint64_t iree_hal_device_profiling_mode_t;
 enum iree_hal_device_profiling_mode_bits_t {
   IREE_HAL_DEVICE_PROFILING_MODE_NONE = 0u,
 
@@ -98,7 +99,6 @@
   // be used when investigating the performance of an individual dispatch.
   IREE_HAL_DEVICE_PROFILING_MODE_EXECUTABLE_COUNTERS = 1u << 2,
 };
-typedef uint32_t iree_hal_device_profiling_mode_t;
 
 // Controls profiling options.
 typedef struct iree_hal_device_profiling_options_t {
@@ -113,6 +113,7 @@
 } iree_hal_device_profiling_options_t;
 
 // A bitfield indicating compatible semaphore behavior for a device.
+typedef uint64_t iree_hal_semaphore_compatibility_t;
 enum iree_hal_semaphore_compatibility_bits_t {
   // Indicates (in the absence of other bits) the semaphore is not compatible
   // with the device at all. Any attempts to use the semaphore for any usage
@@ -152,7 +153,18 @@
       IREE_HAL_SEMAPHORE_COMPATIBILITY_HOST_SIGNAL |
       IREE_HAL_SEMAPHORE_COMPATIBILITY_DEVICE_SIGNAL,
 };
-typedef uint32_t iree_hal_semaphore_compatibility_t;
+
+// Bitfield specifying flags controlling a file read operation.
+typedef uint64_t iree_hal_read_flags_t;
+enum iree_hal_read_flag_bits_t {
+  IREE_HAL_READ_FLAG_NONE = 0,
+};
+
+// Bitfield specifying flags controlling a file write operation.
+typedef uint64_t iree_hal_write_flags_t;
+enum iree_hal_write_flag_bits_t {
+  IREE_HAL_WRITE_FLAG_NONE = 0,
+};
 
 // Defines how a multi-wait operation treats the results of multiple semaphores.
 typedef enum iree_hal_wait_mode_e {
@@ -300,11 +312,12 @@
     iree_hal_buffer_t* buffer);
 
 // Enqueues a single queue-ordered fill operation.
+// The |target_buffer| must be visible to the device queue performing the fill.
 //
 // WARNING: individual fills have a high overhead and batching should be
 // performed by the caller instead of calling this multiple times. The
 // iree_hal_create_transfer_command_buffer utility makes it easy to create
-// batches of transfer operations (fill, copy, update) and is only a few lines
+// batches of transfer operations (fill, update, copy) and is only a few lines
 // more code.
 IREE_API_EXPORT iree_status_t iree_hal_device_queue_fill(
     iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
@@ -312,14 +325,38 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
     iree_device_size_t length, const void* pattern,
-    iree_host_size_t pattern_length);
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags);
 
-// Enqueues a single queue-ordered copy operation.
+// Enqueues a single queue-ordered buffer update operation.
+// The provided |source_buffer| will be captured and need not remain live or
+// unchanged while the operation is queued. The |target_buffer| must be visible
+// to the device queue performing the update.
+//
+// Some implementations may have limits on the size of the update or may perform
+// poorly if the size is larger than an implementation-defined limit. Updates
+// should be kept as small and infrequent as possible.
 //
-// WARNING: individual copies have a high overhead and batching should be
+// WARNING: individual updates have a high overhead and batching should be
 // performed by the caller instead of calling this multiple times. The
 // iree_hal_create_transfer_command_buffer utility makes it easy to create
-// batches of transfer operations (fill, copy, update) and is only a few lines
+// batches of transfer operations (fill, update, copy) and is only a few lines
+// more code.
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_update(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    const void* source_buffer, iree_host_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, iree_hal_update_flags_t flags);
+
+// Enqueues a single queue-ordered copy operation.
+// The |source_buffer| and |target_buffer| must both be visible to the device
+// queue performing the copy.
+//
+// WARNING: individual copies have a high overhead and batching should be
+// performed by the caller instead of calling this multiple times. The
+// iree_hal_create_transfer_command_buffer utility makes it easy to create
+// batches of transfer operations (fill, update, copy) and is only a few lines
 // more code.
 IREE_API_EXPORT iree_status_t iree_hal_device_queue_copy(
     iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
@@ -327,7 +364,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
     iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length);
+    iree_device_size_t length, iree_hal_copy_flags_t flags);
 
 // Enqueues a file read operation that streams a segment of the |source_file|
 // defined by the |source_offset| and |length| into the HAL |target_buffer| at
@@ -340,7 +377,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_file_t* source_file, uint64_t source_offset,
     iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, uint32_t flags);
+    iree_device_size_t length, iree_hal_read_flags_t flags);
 
 // Enqueues a file write operation that streams a segment of the HAL
 // |source_buffer| defined by the |source_offset| and |length| into the
@@ -353,14 +390,14 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
     iree_hal_file_t* target_file, uint64_t target_offset,
-    iree_device_size_t length, uint32_t flags);
+    iree_device_size_t length, iree_hal_write_flags_t flags);
 
-// Executes zero or more command buffers on a device queue.
-// The command buffers are executed in order as if they were recorded as one.
+// Executes a command buffer on a device queue.
 // No commands will execute until the wait fence has been reached and the signal
-// fence will be signaled when all commands have completed.
+// fence will be signaled when all commands have completed. If a command buffer
+// is omitted this will act as a barrier.
 //
-// The queue is selected based on the command buffers submitted and the
+// The queue is selected based on the command buffer submitted and the
 // |queue_affinity|. As the number of available queues can vary the
 // |queue_affinity| is used to hash into the available queues for the required
 // categories. For example if 2 queues support transfer commands and the
@@ -369,10 +406,10 @@
 // placed on to the same queue. Note that the exact hashing function is
 // implementation dependent.
 //
-// A list of binding tables matching the list of command buffers must be
-// provided if any command buffer has indirect bindings and may otherwise be
-// NULL. The binding table contents will be captured during the call and need
-// not persist after the call returns.
+// A binding table must be provided if the command buffer has indirect
+// bindings and may otherwise be `iree_hal_buffer_binding_table_empty()`. The
+// binding table contents will be captured during the call and need not persist
+// after the call returns.
 //
 // The submission behavior matches Vulkan's vkQueueSubmit, with each submission
 // executing its command buffers in the order they are defined but allowing the
@@ -382,9 +419,8 @@
     iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
-    iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers,
-    iree_hal_buffer_binding_table_t const* binding_tables);
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_buffer_binding_table_t binding_table);
 
 // Enqueues a barrier waiting for |wait_semaphore_list| and signaling
 // |signal_semaphore_list| when reached.
@@ -559,13 +595,37 @@
       const iree_hal_semaphore_list_t signal_semaphore_list,
       iree_hal_buffer_t* buffer);
 
+  iree_status_t(IREE_API_PTR* queue_fill)(
+      iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+      const iree_hal_semaphore_list_t wait_semaphore_list,
+      const iree_hal_semaphore_list_t signal_semaphore_list,
+      iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+      iree_device_size_t length, const void* pattern,
+      iree_host_size_t pattern_length, iree_hal_fill_flags_t flags);
+
+  iree_status_t(IREE_API_PTR* queue_update)(
+      iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+      const iree_hal_semaphore_list_t wait_semaphore_list,
+      const iree_hal_semaphore_list_t signal_semaphore_list,
+      const void* source_buffer, iree_host_size_t source_offset,
+      iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+      iree_device_size_t length, iree_hal_update_flags_t flags);
+
+  iree_status_t(IREE_API_PTR* queue_copy)(
+      iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+      const iree_hal_semaphore_list_t wait_semaphore_list,
+      const iree_hal_semaphore_list_t signal_semaphore_list,
+      iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+      iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+      iree_device_size_t length, iree_hal_copy_flags_t flags);
+
   iree_status_t(IREE_API_PTR* queue_read)(
       iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
       const iree_hal_semaphore_list_t wait_semaphore_list,
       const iree_hal_semaphore_list_t signal_semaphore_list,
       iree_hal_file_t* source_file, uint64_t source_offset,
       iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-      iree_device_size_t length, uint32_t flags);
+      iree_device_size_t length, iree_hal_read_flags_t flags);
 
   iree_status_t(IREE_API_PTR* queue_write)(
       iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
@@ -573,15 +633,14 @@
       const iree_hal_semaphore_list_t signal_semaphore_list,
       iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
       iree_hal_file_t* target_file, uint64_t target_offset,
-      iree_device_size_t length, uint32_t flags);
+      iree_device_size_t length, iree_hal_write_flags_t flags);
 
   iree_status_t(IREE_API_PTR* queue_execute)(
       iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
       const iree_hal_semaphore_list_t wait_semaphore_list,
       const iree_hal_semaphore_list_t signal_semaphore_list,
-      iree_host_size_t command_buffer_count,
-      iree_hal_command_buffer_t* const* command_buffers,
-      iree_hal_buffer_binding_table_t const* binding_tables);
+      iree_hal_command_buffer_t* command_buffer,
+      iree_hal_buffer_binding_table_t binding_table);
 
   iree_status_t(IREE_API_PTR* queue_flush)(
       iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity);
@@ -600,6 +659,30 @@
 
 IREE_API_EXPORT void iree_hal_device_destroy(iree_hal_device_t* device);
 
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_fill(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, const void* pattern,
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags);
+
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_update(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    const void* source_buffer, iree_host_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, iree_hal_update_flags_t flags);
+
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_copy(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, iree_hal_copy_flags_t flags);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_device.c b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
index a12043d..2ed014b 100644
--- a/runtime/src/iree/hal/drivers/cuda/cuda_device.c
+++ b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
@@ -1008,7 +1008,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_file_t* source_file, uint64_t source_offset,
     iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_read_flags_t flags) {
   // TODO: expose streaming chunk count/size options.
   iree_status_t loop_status = iree_ok_status();
   iree_hal_file_transfer_options_t options = {
@@ -1029,7 +1029,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
     iree_hal_file_t* target_file, uint64_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_write_flags_t flags) {
   // TODO: expose streaming chunk count/size options.
   iree_status_t loop_status = iree_ok_status();
   iree_hal_file_transfer_options_t options = {
@@ -1053,16 +1053,16 @@
     iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
-    iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers,
-    iree_hal_buffer_binding_table_t const* binding_tables) {
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_buffer_binding_table_t binding_table) {
   iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
   IREE_TRACE_ZONE_BEGIN(z0);
 
   iree_status_t status = iree_hal_deferred_work_queue_enqueue(
       device->work_queue, iree_hal_cuda_device_collect_tracing_context,
       device->tracing_context, wait_semaphore_list, signal_semaphore_list,
-      command_buffer_count, command_buffers, binding_tables);
+      command_buffer ? 1 : 0, command_buffer ? &command_buffer : NULL,
+      &binding_table);
   if (iree_status_is_ok(status)) {
     // Try to advance the deferred work queue.
     status = iree_hal_deferred_work_queue_issue(device->work_queue);
@@ -1129,6 +1129,9 @@
         iree_hal_cuda_device_query_semaphore_compatibility,
     .queue_alloca = iree_hal_cuda_device_queue_alloca,
     .queue_dealloca = iree_hal_cuda_device_queue_dealloca,
+    .queue_fill = iree_hal_device_queue_emulated_fill,
+    .queue_update = iree_hal_device_queue_emulated_update,
+    .queue_copy = iree_hal_device_queue_emulated_copy,
     .queue_read = iree_hal_cuda_device_queue_read,
     .queue_write = iree_hal_cuda_device_queue_write,
     .queue_execute = iree_hal_cuda_device_queue_execute,
diff --git a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
index 3449f82..0240a70 100644
--- a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
@@ -477,9 +477,10 @@
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "event not yet supported");
 }
 
-static iree_status_t iree_hal_cuda_graph_command_buffer_discard_buffer(
+static iree_status_t iree_hal_cuda_graph_command_buffer_advise_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t buffer_ref) {
+    iree_hal_buffer_ref_t buffer_ref, iree_hal_memory_advise_flags_t flags,
+    uint64_t arg0, uint64_t arg1) {
   // We could mark the memory as invalidated so that if this is a managed buffer
   // CUDA does not try to copy it back to the host.
   return iree_ok_status();
@@ -510,7 +511,7 @@
 static iree_status_t iree_hal_cuda_graph_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_buffer_ref_t target_ref, const void* pattern,
-    iree_host_size_t pattern_length) {
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
   iree_hal_cuda_graph_command_buffer_t* command_buffer =
       iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -562,7 +563,8 @@
 
 static iree_status_t iree_hal_cuda_graph_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref,
+    iree_hal_update_flags_t flags) {
   iree_hal_cuda_graph_command_buffer_t* command_buffer =
       iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -626,7 +628,8 @@
 
 static iree_status_t iree_hal_cuda_graph_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref,
+    iree_hal_copy_flags_t flags) {
   iree_hal_cuda_graph_command_buffer_t* command_buffer =
       iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -847,7 +850,7 @@
         .signal_event = iree_hal_cuda_graph_command_buffer_signal_event,
         .reset_event = iree_hal_cuda_graph_command_buffer_reset_event,
         .wait_events = iree_hal_cuda_graph_command_buffer_wait_events,
-        .discard_buffer = iree_hal_cuda_graph_command_buffer_discard_buffer,
+        .advise_buffer = iree_hal_cuda_graph_command_buffer_advise_buffer,
         .fill_buffer = iree_hal_cuda_graph_command_buffer_fill_buffer,
         .update_buffer = iree_hal_cuda_graph_command_buffer_update_buffer,
         .copy_buffer = iree_hal_cuda_graph_command_buffer_copy_buffer,
diff --git a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c
index bc02895..5d6a616 100644
--- a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c
@@ -308,9 +308,10 @@
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "event not yet supported");
 }
 
-static iree_status_t iree_hal_cuda_stream_command_buffer_discard_buffer(
+static iree_status_t iree_hal_cuda_stream_command_buffer_advise_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t buffer_ref) {
+    iree_hal_buffer_ref_t buffer_ref, iree_hal_memory_advise_flags_t flags,
+    uint64_t arg0, uint64_t arg1) {
   // We could mark the memory as invalidated so that if managed CUDA does not
   // try to copy it back to the host.
   return iree_ok_status();
@@ -319,7 +320,7 @@
 static iree_status_t iree_hal_cuda_stream_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_buffer_ref_t target_ref, const void* pattern,
-    iree_host_size_t pattern_length) {
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
   iree_hal_cuda_stream_command_buffer_t* command_buffer =
       iree_hal_cuda_stream_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -371,7 +372,8 @@
 
 static iree_status_t iree_hal_cuda_stream_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref,
+    iree_hal_update_flags_t flags) {
   iree_hal_cuda_stream_command_buffer_t* command_buffer =
       iree_hal_cuda_stream_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -413,7 +415,8 @@
 
 static iree_status_t iree_hal_cuda_stream_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref,
+    iree_hal_copy_flags_t flags) {
   iree_hal_cuda_stream_command_buffer_t* command_buffer =
       iree_hal_cuda_stream_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -599,7 +602,7 @@
         .signal_event = iree_hal_cuda_stream_command_buffer_signal_event,
         .reset_event = iree_hal_cuda_stream_command_buffer_reset_event,
         .wait_events = iree_hal_cuda_stream_command_buffer_wait_events,
-        .discard_buffer = iree_hal_cuda_stream_command_buffer_discard_buffer,
+        .advise_buffer = iree_hal_cuda_stream_command_buffer_advise_buffer,
         .fill_buffer = iree_hal_cuda_stream_command_buffer_fill_buffer,
         .update_buffer = iree_hal_cuda_stream_command_buffer_update_buffer,
         .copy_buffer = iree_hal_cuda_stream_command_buffer_copy_buffer,
diff --git a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c
index d1ef975..1b6baa3 100644
--- a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c
@@ -486,9 +486,10 @@
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "event not yet supported");
 }
 
-static iree_status_t iree_hal_hip_graph_command_buffer_discard_buffer(
+static iree_status_t iree_hal_hip_graph_command_buffer_advise_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t buffer_ref) {
+    iree_hal_buffer_ref_t buffer_ref, iree_hal_memory_advise_flags_t flags,
+    uint64_t arg0, uint64_t arg1) {
   // We could mark the memory as invalidated so that if this is a managed buffer
   // HIP does not try to copy it back to the host.
   return iree_ok_status();
@@ -519,7 +520,7 @@
 static iree_status_t iree_hal_hip_graph_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_buffer_ref_t target_ref, const void* pattern,
-    iree_host_size_t pattern_length) {
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
   iree_hal_hip_graph_command_buffer_t* command_buffer =
       iree_hal_hip_graph_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -570,7 +571,8 @@
 
 static iree_status_t iree_hal_hip_graph_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref,
+    iree_hal_update_flags_t flags) {
   iree_hal_hip_graph_command_buffer_t* command_buffer =
       iree_hal_hip_graph_command_buffer_cast(base_command_buffer);
   if (command_buffer->symbols->hipDrvGraphAddMemcpyNode == NULL) {
@@ -640,7 +642,8 @@
 
 static iree_status_t iree_hal_hip_graph_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref,
+    iree_hal_copy_flags_t flags) {
   iree_hal_hip_graph_command_buffer_t* command_buffer =
       iree_hal_hip_graph_command_buffer_cast(base_command_buffer);
   if (command_buffer->symbols->hipDrvGraphAddMemcpyNode == NULL) {
@@ -856,7 +859,7 @@
         .signal_event = iree_hal_hip_graph_command_buffer_signal_event,
         .reset_event = iree_hal_hip_graph_command_buffer_reset_event,
         .wait_events = iree_hal_hip_graph_command_buffer_wait_events,
-        .discard_buffer = iree_hal_hip_graph_command_buffer_discard_buffer,
+        .advise_buffer = iree_hal_hip_graph_command_buffer_advise_buffer,
         .fill_buffer = iree_hal_hip_graph_command_buffer_fill_buffer,
         .update_buffer = iree_hal_hip_graph_command_buffer_update_buffer,
         .copy_buffer = iree_hal_hip_graph_command_buffer_copy_buffer,
diff --git a/runtime/src/iree/hal/drivers/hip/hip_device.c b/runtime/src/iree/hal/drivers/hip/hip_device.c
index d0e3c55..7f42e8d 100644
--- a/runtime/src/iree/hal/drivers/hip/hip_device.c
+++ b/runtime/src/iree/hal/drivers/hip/hip_device.c
@@ -1006,7 +1006,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_file_t* source_file, uint64_t source_offset,
     iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_read_flags_t flags) {
   // TODO: expose streaming chunk count/size options.
   iree_status_t loop_status = iree_ok_status();
   iree_hal_file_transfer_options_t options = {
@@ -1027,7 +1027,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
     iree_hal_file_t* target_file, uint64_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_write_flags_t flags) {
   // TODO: expose streaming chunk count/size options.
   iree_status_t loop_status = iree_ok_status();
   iree_hal_file_transfer_options_t options = {
@@ -1051,16 +1051,16 @@
     iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
-    iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers,
-    iree_hal_buffer_binding_table_t const* binding_tables) {
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_buffer_binding_table_t binding_table) {
   iree_hal_hip_device_t* device = iree_hal_hip_device_cast(base_device);
   IREE_TRACE_ZONE_BEGIN(z0);
 
   iree_status_t status = iree_hal_deferred_work_queue_enqueue(
       device->work_queue, iree_hal_hip_device_collect_tracing_context,
       device->tracing_context, wait_semaphore_list, signal_semaphore_list,
-      command_buffer_count, command_buffers, binding_tables);
+      command_buffer ? 1 : 0, command_buffer ? &command_buffer : NULL,
+      &binding_table);
   if (iree_status_is_ok(status)) {
     // Try to advance the deferred work queue.
     status = iree_hal_deferred_work_queue_issue(device->work_queue);
@@ -1126,6 +1126,9 @@
         iree_hal_hip_device_query_semaphore_compatibility,
     .queue_alloca = iree_hal_hip_device_queue_alloca,
     .queue_dealloca = iree_hal_hip_device_queue_dealloca,
+    .queue_fill = iree_hal_device_queue_emulated_fill,
+    .queue_update = iree_hal_device_queue_emulated_update,
+    .queue_copy = iree_hal_device_queue_emulated_copy,
     .queue_read = iree_hal_hip_device_queue_read,
     .queue_write = iree_hal_hip_device_queue_write,
     .queue_execute = iree_hal_hip_device_queue_execute,
diff --git a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c
index bb94053..ca5e700 100644
--- a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c
@@ -299,9 +299,10 @@
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "event not yet supported");
 }
 
-static iree_status_t iree_hal_hip_stream_command_buffer_discard_buffer(
+static iree_status_t iree_hal_hip_stream_command_buffer_advise_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t buffer_ref) {
+    iree_hal_buffer_ref_t buffer_ref, iree_hal_memory_advise_flags_t flags,
+    uint64_t arg0, uint64_t arg1) {
   // We could mark the memory as invalidated so that if managed HIP does not
   // try to copy it back to the host.
   return iree_ok_status();
@@ -310,7 +311,7 @@
 static iree_status_t iree_hal_hip_stream_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_buffer_ref_t target_ref, const void* pattern,
-    iree_host_size_t pattern_length) {
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
   iree_hal_hip_stream_command_buffer_t* command_buffer =
       iree_hal_hip_stream_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -362,7 +363,8 @@
 
 static iree_status_t iree_hal_hip_stream_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref,
+    iree_hal_update_flags_t flags) {
   iree_hal_hip_stream_command_buffer_t* command_buffer =
       iree_hal_hip_stream_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -404,7 +406,8 @@
 
 static iree_status_t iree_hal_hip_stream_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref,
+    iree_hal_copy_flags_t flags) {
   iree_hal_hip_stream_command_buffer_t* command_buffer =
       iree_hal_hip_stream_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -588,7 +591,7 @@
         .signal_event = iree_hal_hip_stream_command_buffer_signal_event,
         .reset_event = iree_hal_hip_stream_command_buffer_reset_event,
         .wait_events = iree_hal_hip_stream_command_buffer_wait_events,
-        .discard_buffer = iree_hal_hip_stream_command_buffer_discard_buffer,
+        .advise_buffer = iree_hal_hip_stream_command_buffer_advise_buffer,
         .fill_buffer = iree_hal_hip_stream_command_buffer_fill_buffer,
         .update_buffer = iree_hal_hip_stream_command_buffer_update_buffer,
         .copy_buffer = iree_hal_hip_stream_command_buffer_copy_buffer,
diff --git a/runtime/src/iree/hal/drivers/local_sync/sync_device.c b/runtime/src/iree/hal/drivers/local_sync/sync_device.c
index 8445241..7283e58 100644
--- a/runtime/src/iree/hal/drivers/local_sync/sync_device.c
+++ b/runtime/src/iree/hal/drivers/local_sync/sync_device.c
@@ -327,7 +327,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_file_t* source_file, uint64_t source_offset,
     iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_read_flags_t flags) {
   // TODO: expose streaming chunk count/size options.
   iree_status_t loop_status = iree_ok_status();
   iree_hal_file_transfer_options_t options = {
@@ -348,7 +348,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
     iree_hal_file_t* target_file, uint64_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_write_flags_t flags) {
   // TODO: expose streaming chunk count/size options.
   iree_status_t loop_status = iree_ok_status();
   iree_hal_file_transfer_options_t options = {
@@ -363,86 +363,63 @@
   return loop_status;
 }
 
-static iree_status_t iree_hal_sync_device_apply_deferred_command_buffers(
-    iree_hal_sync_device_t* device, iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers,
-    iree_hal_buffer_binding_table_t const* binding_tables) {
-  // See if there are any deferred command buffers; this saves us work in cases
-  // of pure inline execution.
-  bool any_deferred = false;
-  for (iree_host_size_t i = 0; i < command_buffer_count && !any_deferred; ++i) {
-    any_deferred = iree_hal_deferred_command_buffer_isa(command_buffers[i]);
+static iree_status_t iree_hal_sync_device_apply_deferred_command_buffer(
+    iree_hal_sync_device_t* device, iree_hal_command_buffer_t* command_buffer,
+    iree_hal_buffer_binding_table_t binding_table) {
+  // If no command buffer was provided or it is not deferred then no-op this
+  // call - inline command buffers already executed during recording.
+  if (!command_buffer ||
+      !iree_hal_deferred_command_buffer_isa(command_buffer)) {
+    return iree_ok_status();
   }
-  if (!any_deferred) return iree_ok_status();
 
   // Stack allocate storage for an inline command buffer we'll use to replay
   // the deferred command buffers. We want to reset it between each apply so
   // that we don't get state carrying across.
-  iree_host_size_t max_storage_size = 0;
-  for (iree_host_size_t i = 0; i < command_buffer_count; ++i) {
-    iree_hal_command_buffer_t* command_buffer = command_buffers[i];
-    iree_hal_buffer_binding_table_t binding_table =
-        binding_tables ? binding_tables[i]
-                       : iree_hal_buffer_binding_table_empty();
-    max_storage_size = iree_max(
-        max_storage_size,
-        iree_hal_inline_command_buffer_size(
-            iree_hal_command_buffer_mode(command_buffer) |
-                IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT |
-                IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION |
-                // NOTE: we need to validate if a binding table is provided as
-                // the bindings were not known when it was originally recorded.
-                (iree_hal_buffer_binding_table_is_empty(binding_table)
-                     ? IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED
-                     : 0),
-            /*binding_capacity=*/0));
-  }
+  iree_host_size_t storage_size = iree_hal_inline_command_buffer_size(
+      iree_hal_command_buffer_mode(command_buffer) |
+          IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT |
+          IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION |
+          // NOTE: we need to validate if a binding table is provided as
+          // the bindings were not known when it was originally recorded.
+          (iree_hal_buffer_binding_table_is_empty(binding_table)
+               ? IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED
+               : 0),
+      /*binding_capacity=*/0);
   iree_byte_span_t storage =
-      iree_make_byte_span(iree_alloca(max_storage_size), max_storage_size);
+      iree_make_byte_span(iree_alloca(storage_size), storage_size);
 
-  // NOTE: we ignore any inline command buffers that may be passed in as they've
-  // already executed during recording. The caller is probably in for a bad time
-  // if they mixed the two modes together!
-  for (iree_host_size_t i = 0; i < command_buffer_count; ++i) {
-    iree_hal_command_buffer_t* command_buffer = command_buffers[i];
-    iree_hal_buffer_binding_table_t binding_table =
-        binding_tables ? binding_tables[i]
-                       : iree_hal_buffer_binding_table_empty();
-    if (iree_hal_deferred_command_buffer_isa(command_buffer)) {
-      // NOTE: we run unvalidated as inline command buffers don't support
-      // binding tables and can be validated entirely while recording.
-      iree_hal_command_buffer_t* inline_command_buffer = NULL;
-      IREE_RETURN_IF_ERROR(iree_hal_inline_command_buffer_initialize(
-          device->device_allocator,
-          iree_hal_command_buffer_mode(command_buffer) |
-              IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT |
-              IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION |
-              // NOTE: we need to validate if a binding table is provided as the
-              // bindings were not known when it was originally recorded.
-              (iree_hal_buffer_binding_table_is_empty(binding_table)
-                   ? IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED
-                   : 0),
-          iree_hal_command_buffer_allowed_categories(command_buffer),
-          IREE_HAL_QUEUE_AFFINITY_ANY,
-          /*binding_capacity=*/0, device->host_allocator, storage,
-          &inline_command_buffer));
-      iree_status_t status = iree_hal_deferred_command_buffer_apply(
-          command_buffer, inline_command_buffer, binding_table);
-      iree_hal_inline_command_buffer_deinitialize(inline_command_buffer);
-      IREE_RETURN_IF_ERROR(status);
-    }
-  }
+  // NOTE: we run unvalidated as inline command buffers don't support
+  // binding tables and can be validated entirely while recording.
+  iree_hal_command_buffer_t* inline_command_buffer = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_inline_command_buffer_initialize(
+      device->device_allocator,
+      iree_hal_command_buffer_mode(command_buffer) |
+          IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT |
+          IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION |
+          // NOTE: we need to validate if a binding table is provided as the
+          // bindings were not known when it was originally recorded.
+          (iree_hal_buffer_binding_table_is_empty(binding_table)
+               ? IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED
+               : 0),
+      iree_hal_command_buffer_allowed_categories(command_buffer),
+      IREE_HAL_QUEUE_AFFINITY_ANY,
+      /*binding_capacity=*/0, device->host_allocator, storage,
+      &inline_command_buffer));
 
-  return iree_ok_status();
+  iree_status_t status = iree_hal_deferred_command_buffer_apply(
+      command_buffer, inline_command_buffer, binding_table);
+
+  iree_hal_inline_command_buffer_deinitialize(inline_command_buffer);
+  return status;
 }
 
 static iree_status_t iree_hal_sync_device_queue_execute(
     iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
-    iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers,
-    iree_hal_buffer_binding_table_t const* binding_tables) {
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_buffer_binding_table_t binding_table) {
   iree_hal_sync_device_t* device = iree_hal_sync_device_cast(base_device);
 
   // TODO(#4680): there is some better error handling here needed; we should
@@ -457,8 +434,8 @@
 
   // Run all deferred command buffers - any we could have run inline we already
   // did during recording.
-  IREE_RETURN_IF_ERROR(iree_hal_sync_device_apply_deferred_command_buffers(
-      device, command_buffer_count, command_buffers, binding_tables));
+  IREE_RETURN_IF_ERROR(iree_hal_sync_device_apply_deferred_command_buffer(
+      device, command_buffer, binding_table));
 
   // Signal all semaphores now that batch work has completed.
   IREE_RETURN_IF_ERROR(iree_hal_sync_semaphore_multi_signal(
@@ -526,6 +503,9 @@
         iree_hal_sync_device_query_semaphore_compatibility,
     .queue_alloca = iree_hal_sync_device_queue_alloca,
     .queue_dealloca = iree_hal_sync_device_queue_dealloca,
+    .queue_fill = iree_hal_device_queue_emulated_fill,
+    .queue_update = iree_hal_device_queue_emulated_update,
+    .queue_copy = iree_hal_device_queue_emulated_copy,
     .queue_read = iree_hal_sync_device_queue_read,
     .queue_write = iree_hal_sync_device_queue_write,
     .queue_execute = iree_hal_sync_device_queue_execute,
diff --git a/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c b/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c
index 0e60669..316ff05 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c
@@ -455,12 +455,13 @@
 }
 
 //===----------------------------------------------------------------------===//
-// iree_hal_command_buffer_discard_buffer
+// iree_hal_command_buffer_advise_buffer
 //===----------------------------------------------------------------------===//
 
-static iree_status_t iree_hal_task_command_buffer_discard_buffer(
+static iree_status_t iree_hal_task_command_buffer_advise_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t buffer_ref) {
+    iree_hal_buffer_ref_t buffer_ref, iree_hal_memory_advise_flags_t flags,
+    uint64_t arg0, uint64_t arg1) {
   return iree_ok_status();
 }
 
@@ -509,7 +510,7 @@
 static iree_status_t iree_hal_task_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_buffer_ref_t target_ref, const void* pattern,
-    iree_host_size_t pattern_length) {
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
   iree_hal_task_command_buffer_t* command_buffer =
       iree_hal_task_command_buffer_cast(base_command_buffer);
 
@@ -567,7 +568,8 @@
 
 static iree_status_t iree_hal_task_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref,
+    iree_hal_update_flags_t flags) {
   iree_hal_task_command_buffer_t* command_buffer =
       iree_hal_task_command_buffer_cast(base_command_buffer);
 
@@ -637,7 +639,8 @@
 
 static iree_status_t iree_hal_task_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref,
+    iree_hal_copy_flags_t flags) {
   iree_hal_task_command_buffer_t* command_buffer =
       iree_hal_task_command_buffer_cast(base_command_buffer);
 
@@ -946,7 +949,7 @@
         .signal_event = iree_hal_task_command_buffer_signal_event,
         .reset_event = iree_hal_task_command_buffer_reset_event,
         .wait_events = iree_hal_task_command_buffer_wait_events,
-        .discard_buffer = iree_hal_task_command_buffer_discard_buffer,
+        .advise_buffer = iree_hal_task_command_buffer_advise_buffer,
         .fill_buffer = iree_hal_task_command_buffer_fill_buffer,
         .update_buffer = iree_hal_task_command_buffer_update_buffer,
         .copy_buffer = iree_hal_task_command_buffer_copy_buffer,
diff --git a/runtime/src/iree/hal/drivers/local_task/task_device.c b/runtime/src/iree/hal/drivers/local_task/task_device.c
index df4bb93..8aa0925 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_device.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_device.c
@@ -415,7 +415,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_file_t* source_file, uint64_t source_offset,
     iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_read_flags_t flags) {
   // TODO: expose streaming chunk count/size options.
   iree_status_t loop_status = iree_ok_status();
   iree_hal_file_transfer_options_t options = {
@@ -436,7 +436,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
     iree_hal_file_t* target_file, uint64_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_write_flags_t flags) {
   // TODO: expose streaming chunk count/size options.
   iree_status_t loop_status = iree_ok_status();
   iree_hal_file_transfer_options_t options = {
@@ -455,14 +455,13 @@
     iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
-    iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers,
-    iree_hal_buffer_binding_table_t const* binding_tables) {
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_buffer_binding_table_t binding_table) {
   iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
   // NOTE: today we are not discriminating queues based on command type.
   iree_host_size_t queue_index = iree_hal_task_device_select_queue(
       device, IREE_HAL_COMMAND_CATEGORY_ANY, queue_affinity);
-  if (command_buffer_count == 0) {
+  if (command_buffer == NULL) {
     // Fast-path for barriers (fork/join/sequence).
     return iree_hal_task_queue_submit_barrier(&device->queues[queue_index],
                                               wait_semaphore_list,
@@ -471,9 +470,8 @@
   iree_hal_task_submission_batch_t batch = {
       .wait_semaphores = wait_semaphore_list,
       .signal_semaphores = signal_semaphore_list,
-      .command_buffer_count = command_buffer_count,
-      .command_buffers = command_buffers,
-      .binding_tables = binding_tables,
+      .command_buffer = command_buffer,
+      .binding_table = binding_table,
   };
   return iree_hal_task_queue_submit_commands(&device->queues[queue_index], 1,
                                              &batch);
@@ -540,6 +538,9 @@
         iree_hal_task_device_query_semaphore_compatibility,
     .queue_alloca = iree_hal_task_device_queue_alloca,
     .queue_dealloca = iree_hal_task_device_queue_dealloca,
+    .queue_fill = iree_hal_device_queue_emulated_fill,
+    .queue_update = iree_hal_device_queue_emulated_update,
+    .queue_copy = iree_hal_device_queue_emulated_copy,
     .queue_read = iree_hal_task_device_queue_read,
     .queue_write = iree_hal_task_device_queue_write,
     .queue_execute = iree_hal_task_device_queue_execute,
diff --git a/runtime/src/iree/hal/drivers/local_task/task_queue.c b/runtime/src/iree/hal/drivers/local_task/task_queue.c
index dde76b5..cdd4c22 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_queue.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_queue.c
@@ -193,10 +193,10 @@
   // the submission has completed (or failed).
   iree_hal_resource_set_t* resource_set;
 
-  // Command buffers to be issued in the order they appeared in the submission.
-  iree_host_size_t command_buffer_count;
-  iree_hal_command_buffer_t** command_buffers;
-  iree_hal_buffer_binding_table_t* binding_tables;
+  // Command buffer to be issued.
+  iree_hal_command_buffer_t* command_buffer;
+  // Optional binding table for the command buffer.
+  iree_hal_buffer_binding_table_t binding_table;
 } iree_hal_task_queue_issue_cmd_t;
 
 static iree_status_t iree_hal_task_queue_issue_cmd_deferred(
@@ -265,34 +265,28 @@
   iree_hal_task_queue_issue_cmd_t* cmd = (iree_hal_task_queue_issue_cmd_t*)task;
   IREE_TRACE_ZONE_BEGIN(z0);
 
-  iree_status_t status = iree_ok_status();
-
   // NOTE: it's ok for there to be no command buffers - in that case the
   // submission was purely for synchronization.
-  for (iree_host_size_t i = 0; i < cmd->command_buffer_count; ++i) {
-    iree_hal_command_buffer_t* command_buffer = cmd->command_buffers[i];
-    if (iree_hal_task_command_buffer_isa(command_buffer)) {
-      if (cmd->binding_tables && cmd->binding_tables[i].count > 0) {
+  iree_status_t status = iree_ok_status();
+  if (cmd->command_buffer != NULL) {
+    if (iree_hal_task_command_buffer_isa(cmd->command_buffer)) {
+      if (cmd->binding_table.count > 0) {
         status = iree_make_status(
             IREE_STATUS_UNIMPLEMENTED,
             "task command buffers do not support binding tables yet");
       } else {
         status = iree_hal_task_command_buffer_issue(
-            command_buffer, &cmd->queue->state,
+            cmd->command_buffer, &cmd->queue->state,
             cmd->task.header.completion_task, cmd->arena, pending_submission);
       }
-    } else if (iree_hal_deferred_command_buffer_isa(command_buffer)) {
-      iree_hal_buffer_binding_table_t binding_table =
-          cmd->binding_tables ? cmd->binding_tables[i]
-                              : iree_hal_buffer_binding_table_empty();
+    } else if (iree_hal_deferred_command_buffer_isa(cmd->command_buffer)) {
       status = iree_hal_task_queue_issue_cmd_deferred(
-          cmd, command_buffer, binding_table, pending_submission);
+          cmd, cmd->command_buffer, cmd->binding_table, pending_submission);
     } else {
       status = iree_make_status(
           IREE_STATUS_UNIMPLEMENTED,
           "unsupported command buffer type for task queue submission");
     }
-    if (IREE_UNLIKELY(!iree_status_is_ok(status))) break;
   }
 
   IREE_TRACE_ZONE_END(z0);
@@ -308,21 +302,9 @@
       (iree_hal_task_submission_batch_t*)user_data;
 
   iree_hal_task_queue_issue_cmd_t* cmd = NULL;
-  iree_host_size_t command_buffers_size =
-      batch->command_buffer_count * sizeof(*cmd->command_buffers);
-  iree_host_size_t binding_tables_size = 0;
-  iree_host_size_t binding_table_elements_size = 0;
-  if (batch->binding_tables) {
-    binding_tables_size =
-        batch->command_buffer_count * sizeof(*cmd->binding_tables);
-    for (iree_host_size_t i = 0; i < batch->command_buffer_count; ++i) {
-      binding_table_elements_size += batch->binding_tables[i].count *
-                                     sizeof(*batch->binding_tables[i].bindings);
-    }
-  }
-  iree_host_size_t total_cmd_size = sizeof(*cmd) + command_buffers_size +
-                                    binding_tables_size +
-                                    binding_table_elements_size;
+  iree_host_size_t binding_table_elements_size =
+      batch->binding_table.count * sizeof(*batch->binding_table.bindings);
+  iree_host_size_t total_cmd_size = sizeof(*cmd) + binding_table_elements_size;
   IREE_RETURN_IF_ERROR(
       iree_arena_allocate(arena, total_cmd_size, (void**)&cmd));
   iree_task_call_initialize(
@@ -333,42 +315,30 @@
   cmd->queue = queue;
   cmd->resource_set = resource_set;
 
-  cmd->command_buffer_count = batch->command_buffer_count;
-  cmd->command_buffers =
-      (iree_hal_command_buffer_t**)((uint8_t*)cmd + sizeof(*cmd));
-  memcpy(cmd->command_buffers, batch->command_buffers, command_buffers_size);
+  cmd->command_buffer = batch->command_buffer;
+  cmd->binding_table = iree_hal_buffer_binding_table_empty();
 
   // Binding tables are optional and we only need this extra work if there were
   // any non-empty binding tables provided during submission.
   iree_status_t status = iree_ok_status();
   if (binding_table_elements_size > 0) {
     // Copy over binding tables and all of their contents.
-    cmd->binding_tables =
-        (iree_hal_buffer_binding_table_t*)((uint8_t*)cmd->command_buffers +
-                                           command_buffers_size);
     iree_hal_buffer_binding_t* binding_element_ptr =
-        (iree_hal_buffer_binding_t*)((uint8_t*)cmd->binding_tables +
-                                     binding_tables_size);
-    for (iree_host_size_t i = 0; i < batch->command_buffer_count; ++i) {
-      iree_host_size_t element_count = batch->binding_tables[i].count;
-      cmd->binding_tables[i].count = element_count;
-      cmd->binding_tables[i].bindings = binding_element_ptr;
-      memcpy((void*)cmd->binding_tables[i].bindings,
-             batch->binding_tables[i].bindings,
-             element_count * sizeof(*binding_element_ptr));
-      binding_element_ptr += element_count;
+        (iree_hal_buffer_binding_t*)((uint8_t*)cmd + sizeof(*cmd));
+    const iree_host_size_t element_count = batch->binding_table.count;
+    cmd->binding_table.count = element_count;
+    cmd->binding_table.bindings = binding_element_ptr;
+    memcpy((void*)cmd->binding_table.bindings, batch->binding_table.bindings,
+           element_count * sizeof(*binding_element_ptr));
+    binding_element_ptr += element_count;
 
-      // Bulk insert all bindings into the resource set. This will keep the
-      // referenced buffers live until the issue has completed. Note that if we
-      // fail here we need to clean up the resource set below before returning.
-      status = iree_hal_resource_set_insert_strided(
-          cmd->resource_set, element_count, cmd->binding_tables[i].bindings,
-          offsetof(iree_hal_buffer_binding_t, buffer),
-          sizeof(iree_hal_buffer_binding_t));
-      if (!iree_status_is_ok(status)) break;
-    }
-  } else {
-    cmd->binding_tables = NULL;
+    // Bulk insert all bindings into the resource set. This will keep the
+    // referenced buffers live until the issue has completed. Note that if we
+    // fail here we need to clean up the resource set below before returning.
+    status = iree_hal_resource_set_insert_strided(
+        cmd->resource_set, element_count, cmd->binding_table.bindings,
+        offsetof(iree_hal_buffer_binding_t, buffer),
+        sizeof(iree_hal_buffer_binding_t));
   }
 
   if (iree_status_is_ok(status)) {
@@ -684,9 +654,8 @@
   for (iree_host_size_t i = 0; i < batch_count; ++i) {
     const iree_hal_task_submission_batch_t* batch = &batches[i];
     IREE_RETURN_IF_ERROR(iree_hal_task_queue_submit(
-        queue, batch->wait_semaphores, batch->signal_semaphores,
-        batch->command_buffer_count,
-        (iree_hal_resource_t* const*)batch->command_buffers,
+        queue, batch->wait_semaphores, batch->signal_semaphores, 1,
+        (iree_hal_resource_t* const*)&batch->command_buffer,
         iree_hal_task_queue_issue_cmd_allocate, (void*)batch));
   }
   return iree_ok_status();
diff --git a/runtime/src/iree/hal/drivers/local_task/task_queue.h b/runtime/src/iree/hal/drivers/local_task/task_queue.h
index 0d667ae..91065ff 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_queue.h
+++ b/runtime/src/iree/hal/drivers/local_task/task_queue.h
@@ -22,22 +22,18 @@
 extern "C" {
 #endif  // __cplusplus
 
-// A single batch of command buffers submitted to a device queue.
+// A single command buffer submitted to a device queue.
 // All of the wait semaphores must reach or exceed the given payload values
-// prior to the batch beginning execution. Each command buffer begins execution
-// in the order it is present in the list, though note that the command buffers
-// execute concurrently and require internal synchronization via events if there
-// are any dependencies between them. Only after all command buffers have
+// prior to the batch beginning execution. Only after all commands have
 // completed will the signal semaphores be updated to the provided payload
 // values.
 typedef struct iree_hal_task_submission_batch_t {
   // Semaphores to wait on prior to executing any command buffer.
   iree_hal_semaphore_list_t wait_semaphores;
 
-  // Command buffers to execute, in order, and optional binding tables 1:1.
-  iree_host_size_t command_buffer_count;
-  iree_hal_command_buffer_t* const* command_buffers;
-  iree_hal_buffer_binding_table_t const* binding_tables;
+  // Command buffer to execute and optional binding table.
+  iree_hal_command_buffer_t* command_buffer;
+  iree_hal_buffer_binding_table_t binding_table;
 
   // Semaphores to signal once all command buffers have completed execution.
   iree_hal_semaphore_list_t signal_semaphores;
diff --git a/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m b/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m
index 16bd8c5..f5f4a32 100644
--- a/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m
+++ b/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m
@@ -553,8 +553,9 @@
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "event not yet supported");
 }
 
-static iree_status_t iree_hal_metal_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_ref_t buffer_ref) {
+static iree_status_t iree_hal_metal_command_buffer_advise_buffer(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_ref_t buffer_ref,
+    iree_hal_memory_advise_flags_t flags, uint64_t arg0, uint64_t arg1) {
   // This is a hint to the device and we have nothing to do for Metal.
   return iree_ok_status();
 }
@@ -620,7 +621,7 @@
 
 static iree_status_t iree_hal_metal_command_buffer_prepare_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_ref_t target_ref,
-    const void* pattern, iree_host_size_t pattern_length) {
+    const void* pattern, iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
   iree_hal_metal_command_buffer_t* command_buffer =
       iree_hal_metal_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -764,7 +765,8 @@
 
 static iree_status_t iree_hal_metal_command_buffer_prepare_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref,
+    iree_hal_update_flags_t flags) {
   iree_hal_metal_command_buffer_t* command_buffer =
       iree_hal_metal_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -797,7 +799,7 @@
 
 static iree_status_t iree_hal_metal_command_buffer_prepare_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_ref_t source_ref,
-    iree_hal_buffer_ref_t target_ref) {
+    iree_hal_buffer_ref_t target_ref, iree_hal_copy_flags_t flags) {
   iree_hal_metal_command_buffer_t* command_buffer =
       iree_hal_metal_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -1068,7 +1070,7 @@
     .signal_event = iree_hal_metal_command_buffer_signal_event,
     .reset_event = iree_hal_metal_command_buffer_reset_event,
     .wait_events = iree_hal_metal_command_buffer_wait_events,
-    .discard_buffer = iree_hal_metal_command_buffer_discard_buffer,
+    .advise_buffer = iree_hal_metal_command_buffer_advise_buffer,
     .fill_buffer = iree_hal_metal_command_buffer_prepare_fill_buffer,
     .update_buffer = iree_hal_metal_command_buffer_prepare_update_buffer,
     .copy_buffer = iree_hal_metal_command_buffer_prepare_copy_buffer,
diff --git a/runtime/src/iree/hal/drivers/metal/metal_device.m b/runtime/src/iree/hal/drivers/metal/metal_device.m
index 72e09d8..4f8b4fd 100644
--- a/runtime/src/iree/hal/drivers/metal/metal_device.m
+++ b/runtime/src/iree/hal/drivers/metal/metal_device.m
@@ -347,7 +347,7 @@
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list, iree_hal_file_t* source_file,
     uint64_t source_offset, iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_read_flags_t flags) {
   // TODO: expose streaming chunk count/size options.
   iree_status_t loop_status = iree_ok_status();
   iree_hal_file_transfer_options_t options = {
@@ -366,7 +366,7 @@
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list, iree_hal_buffer_t* source_buffer,
     iree_device_size_t source_offset, iree_hal_file_t* target_file, uint64_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_write_flags_t flags) {
   // TODO: expose streaming chunk count/size options.
   iree_status_t loop_status = iree_ok_status();
   iree_hal_file_transfer_options_t options = {
@@ -415,9 +415,8 @@
 static iree_status_t iree_hal_metal_device_queue_execute(
     iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
-    const iree_hal_semaphore_list_t signal_semaphore_list, iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers,
-    iree_hal_buffer_binding_table_t const* binding_tables) {
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_binding_table_t binding_table) {
   iree_hal_metal_device_t* device = iree_hal_metal_device_cast(base_device);
   IREE_TRACE_ZONE_BEGIN(z0);
 
@@ -434,37 +433,30 @@
                                           signal_semaphore_list.semaphores);
   }
 
-  // Translate any deferred command buffers into real Metal command buffers.
+  // Translate deferred command buffers into real Metal command buffers.
   // We do this prior to beginning execution so that if we fail we don't leave the system in an
   // inconsistent state.
-  iree_hal_command_buffer_t** direct_command_buffers = (iree_hal_command_buffer_t**)iree_alloca(
-      command_buffer_count * sizeof(iree_hal_command_buffer_t*));
-  if (iree_status_is_ok(status)) {
-    for (iree_host_size_t i = 0; i < command_buffer_count; ++i) {
-      iree_hal_command_buffer_t* command_buffer = command_buffers[i];
-      iree_hal_command_buffer_t* direct_command_buffer = NULL;
-      if (iree_hal_deferred_command_buffer_isa(command_buffer)) {
-        // Create a temporary command buffer and replay the deferred command buffer with the
-        // binding table provided. Note that any resources used will be retained by the command
-        // buffer so we only need to retain the command buffer itself instead of the binding
-        // tables provided.
-        iree_hal_buffer_binding_table_t binding_table =
-            binding_tables ? binding_tables[i] : iree_hal_buffer_binding_table_empty();
-        @autoreleasepool {
-          status = iree_hal_metal_replay_command_buffer(device, command_buffer, binding_table,
-                                                        &direct_command_buffer);
-        }
-      } else {
-        // Retain the command buffer until the submission has completed.
-        iree_hal_command_buffer_retain(command_buffer);
-        direct_command_buffer = command_buffer;
+  iree_hal_command_buffer_t* direct_command_buffer = NULL;
+  if (iree_status_is_ok(status) && command_buffer) {
+    // NOTE: assigns the function-scope |direct_command_buffer| that is committed below.
+    if (iree_hal_deferred_command_buffer_isa(command_buffer)) {
+      // Create a temporary command buffer and replay the deferred command buffer with the
+      // binding table provided. Note that any resources used will be retained by the command
+      // buffer so we only need to retain the command buffer itself instead of the binding
+      // table provided.
+      @autoreleasepool {
+        status = iree_hal_metal_replay_command_buffer(device, command_buffer, binding_table,
+                                                      &direct_command_buffer);
       }
-      if (!iree_status_is_ok(status)) break;
-      status = iree_hal_resource_set_insert(resource_set, 1, &direct_command_buffer);
-      if (!iree_status_is_ok(status)) break;
-      iree_hal_command_buffer_release(direct_command_buffer);  // retained in resource set
-      direct_command_buffers[i] = direct_command_buffer;
+    } else {
+      // Retain the command buffer until the submission has completed.
+      iree_hal_command_buffer_retain(command_buffer);
+      direct_command_buffer = command_buffer;
     }
+    if (iree_status_is_ok(status)) {
+      status = iree_hal_resource_set_insert(resource_set, 1, &direct_command_buffer);
+    }
+    iree_hal_command_buffer_release(direct_command_buffer);  // retained in resource set
   }
 
   if (iree_status_is_ok(status)) {
@@ -485,16 +477,14 @@
       // Then commit all recorded compute command buffers, except the last one, which we will patch
       // up with semaphore signaling.
       id<MTLCommandBuffer> signal_command_buffer = nil;
-      for (iree_host_size_t i = 0; i < command_buffer_count; ++i) {
+      if (direct_command_buffer) {
         // NOTE: translation happens above such that we always know these are direct command
         // buffers.
         //
         // TODO(indirect-cmd): support indirect command buffers and switch here, or only use
         // indirect command buffers and assume that instead.
-        iree_hal_command_buffer_t* direct_command_buffer = direct_command_buffers[i];
         id<MTLCommandBuffer> handle =
             iree_hal_metal_direct_command_buffer_handle(direct_command_buffer);
-        if (i + 1 != command_buffer_count) [handle commit];
         signal_command_buffer = handle;
       }
       if (signal_command_buffer == nil) {
@@ -627,6 +617,9 @@
     .query_semaphore_compatibility = iree_hal_metal_device_query_semaphore_compatibility,
     .queue_alloca = iree_hal_metal_device_queue_alloca,
     .queue_dealloca = iree_hal_metal_device_queue_dealloca,
+    .queue_fill = iree_hal_device_queue_emulated_fill,
+    .queue_update = iree_hal_device_queue_emulated_update,
+    .queue_copy = iree_hal_device_queue_emulated_copy,
     .queue_read = iree_hal_metal_device_queue_read,
     .queue_write = iree_hal_metal_device_queue_write,
     .queue_execute = iree_hal_metal_device_queue_execute,
diff --git a/runtime/src/iree/hal/drivers/null/command_buffer.c b/runtime/src/iree/hal/drivers/null/command_buffer.c
index 4f8fe82..16b33e9 100644
--- a/runtime/src/iree/hal/drivers/null/command_buffer.c
+++ b/runtime/src/iree/hal/drivers/null/command_buffer.c
@@ -220,9 +220,10 @@
   return status;
 }
 
-static iree_status_t iree_hal_null_command_buffer_discard_buffer(
+static iree_status_t iree_hal_null_command_buffer_advise_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t buffer_ref) {
+    iree_hal_buffer_ref_t buffer_ref, iree_hal_memory_advise_flags_t flags,
+    uint64_t arg0, uint64_t arg1) {
   iree_hal_null_command_buffer_t* command_buffer =
       iree_hal_null_command_buffer_cast(base_command_buffer);
 
@@ -241,7 +242,7 @@
 static iree_status_t iree_hal_null_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_buffer_ref_t target_ref, const void* pattern,
-    iree_host_size_t pattern_length) {
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
   iree_hal_null_command_buffer_t* command_buffer =
       iree_hal_null_command_buffer_cast(base_command_buffer);
 
@@ -257,7 +258,8 @@
 
 static iree_status_t iree_hal_null_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref,
+    iree_hal_update_flags_t flags) {
   iree_hal_null_command_buffer_t* command_buffer =
       iree_hal_null_command_buffer_cast(base_command_buffer);
 
@@ -275,7 +277,8 @@
 
 static iree_status_t iree_hal_null_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref,
+    iree_hal_copy_flags_t flags) {
   iree_hal_null_command_buffer_t* command_buffer =
       iree_hal_null_command_buffer_cast(base_command_buffer);
 
@@ -361,7 +364,7 @@
         .signal_event = iree_hal_null_command_buffer_signal_event,
         .reset_event = iree_hal_null_command_buffer_reset_event,
         .wait_events = iree_hal_null_command_buffer_wait_events,
-        .discard_buffer = iree_hal_null_command_buffer_discard_buffer,
+        .advise_buffer = iree_hal_null_command_buffer_advise_buffer,
         .fill_buffer = iree_hal_null_command_buffer_fill_buffer,
         .update_buffer = iree_hal_null_command_buffer_update_buffer,
         .copy_buffer = iree_hal_null_command_buffer_copy_buffer,
diff --git a/runtime/src/iree/hal/drivers/null/device.c b/runtime/src/iree/hal/drivers/null/device.c
index ce12240..1195364 100644
--- a/runtime/src/iree/hal/drivers/null/device.c
+++ b/runtime/src/iree/hal/drivers/null/device.c
@@ -357,13 +357,64 @@
   return status;
 }
 
+static iree_status_t iree_hal_null_device_queue_fill(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, const void* pattern,
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
+  // TODO(null): if a native queue fill operation is available use that instead.
+  // The emulated fill creates a command buffer and executes it and it's best if
+  // the extra recording/upload/allocation time can be avoided.
+  return iree_hal_device_queue_emulated_fill(
+      base_device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+      target_buffer, target_offset, length, pattern, pattern_length, flags);
+}
+
+static iree_status_t iree_hal_null_device_queue_update(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    const void* source_buffer, iree_host_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, iree_hal_update_flags_t flags) {
+  // TODO(null): if a native queue update operation is available use that
+  // instead. The emulated update creates a command buffer and executes it and
+  // it's best if the extra recording/upload/allocation time can be avoided.
+  // Since command buffers have a limited capacity for embedded data the
+  // emulated version may need to allocate buffers, split the update into
+  // multiple commands, or commit other sins a native implementation would be
+  // able to avoid.
+  return iree_hal_device_queue_emulated_update(
+      base_device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+      source_buffer, source_offset, target_buffer, target_offset, length,
+      flags);
+}
+
+static iree_status_t iree_hal_null_device_queue_copy(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, iree_hal_copy_flags_t flags) {
+  // TODO(null): if a native queue copy operation is available use that instead.
+  // The emulated copy creates a command buffer and executes it and it's best if
+  // the extra recording/upload/allocation time can be avoided.
+  return iree_hal_device_queue_emulated_copy(
+      base_device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+      source_buffer, source_offset, target_buffer, target_offset, length,
+      flags);
+}
+
 static iree_status_t iree_hal_null_device_queue_read(
     iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_file_t* source_file, uint64_t source_offset,
     iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_read_flags_t flags) {
   // TODO(null): if native support for file operations are available then
   // definitely prefer those over the emulated implementation provided here by
   // default. The implementation performs allocations, creates semaphores, and
@@ -389,7 +440,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
     iree_hal_file_t* target_file, uint64_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_write_flags_t flags) {
   // TODO(null): if native support for file operations are available then
   // definitely prefer those over the emulated implementation provided here by
   // default. The implementation performs allocations, creates semaphores, and
@@ -413,9 +464,8 @@
     iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
-    iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers,
-    iree_hal_buffer_binding_table_t const* binding_tables) {
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_buffer_binding_table_t binding_table) {
   iree_hal_null_device_t* device = iree_hal_null_device_cast(base_device);
 
   // TODO(null): implement a wait, execute, and signal queue operation. The
@@ -423,14 +473,11 @@
   // are to be used when executing and it can be assumed that all resources
   // required for execution are accessible on those queues. If more than one
   // queue is specified the implementation may use any it prefers from the set.
-  // If more than one command buffer is provided it is expected that they are
-  // executed in order on the same queue.
 
-  // TODO(null): optional binding tables matching 1:1 with the command buffers
-  // are provided for any indirect command buffers (those who have a
-  // binding_capacity > 0). The binding tables must be captured by the
-  // implementation as they may be mutated or freed by the caller immediately
-  // after this call returns.
+  // TODO(null): an optional binding table is provided for indirect command
+  // buffers (those who have a binding_capacity > 0). The binding table must be
+  // captured by the implementation as it may be mutated or freed by the
+  // caller immediately after this call returns.
 
   // TODO(null): do this async - callers may be submitting work to multiple
   // devices or queues on the same device from the same thread and blocking here
@@ -552,6 +599,9 @@
         iree_hal_null_device_query_semaphore_compatibility,
     .queue_alloca = iree_hal_null_device_queue_alloca,
     .queue_dealloca = iree_hal_null_device_queue_dealloca,
+    .queue_fill = iree_hal_null_device_queue_fill,
+    .queue_update = iree_hal_null_device_queue_update,
+    .queue_copy = iree_hal_null_device_queue_copy,
     .queue_read = iree_hal_null_device_queue_read,
     .queue_write = iree_hal_null_device_queue_write,
     .queue_execute = iree_hal_null_device_queue_execute,
diff --git a/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.cc b/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.cc
index 20782fd..b6f91d4 100644
--- a/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.cc
@@ -495,9 +495,10 @@
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_vulkan_direct_command_buffer_discard_buffer(
+static iree_status_t iree_hal_vulkan_direct_command_buffer_advise_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t buffer_ref) {
+    iree_hal_buffer_ref_t buffer_ref, iree_hal_memory_advise_flags_t flags,
+    uint64_t arg0, uint64_t arg1) {
   // NOTE: we could use this to prevent queue family transitions.
   return iree_ok_status();
 }
@@ -527,7 +528,7 @@
 static iree_status_t iree_hal_vulkan_direct_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_buffer_ref_t target_ref, const void* pattern,
-    iree_host_size_t pattern_length) {
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
   iree_hal_vulkan_direct_command_buffer_t* command_buffer =
       iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
   VkBuffer target_device_buffer =
@@ -590,7 +591,8 @@
 
 static iree_status_t iree_hal_vulkan_direct_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref,
+    iree_hal_update_flags_t flags) {
   iree_hal_vulkan_direct_command_buffer_t* command_buffer =
       iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
   VkBuffer target_device_buffer =
@@ -631,7 +633,8 @@
 
 static iree_status_t iree_hal_vulkan_direct_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref,
+    iree_hal_copy_flags_t flags) {
   iree_hal_vulkan_direct_command_buffer_t* command_buffer =
       iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
   VkBuffer source_device_buffer =
@@ -817,8 +820,8 @@
         iree_hal_vulkan_direct_command_buffer_signal_event,
         /*.reset_event=*/iree_hal_vulkan_direct_command_buffer_reset_event,
         /*.wait_events=*/iree_hal_vulkan_direct_command_buffer_wait_events,
-        /*.discard_buffer=*/
-        iree_hal_vulkan_direct_command_buffer_discard_buffer,
+        /*.advise_buffer=*/
+        iree_hal_vulkan_direct_command_buffer_advise_buffer,
         /*.fill_buffer=*/iree_hal_vulkan_direct_command_buffer_fill_buffer,
         /*.update_buffer=*/
         iree_hal_vulkan_direct_command_buffer_update_buffer,
diff --git a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
index 3b7192d..6db27bc 100644
--- a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
@@ -1666,7 +1666,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_file_t* source_file, uint64_t source_offset,
     iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_read_flags_t flags) {
   // TODO: expose streaming chunk count/size options.
   iree_status_t loop_status = iree_ok_status();
   iree_hal_file_transfer_options_t options = {
@@ -1687,7 +1687,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
     iree_hal_file_t* target_file, uint64_t target_offset,
-    iree_device_size_t length, uint32_t flags) {
+    iree_device_size_t length, iree_hal_write_flags_t flags) {
   // TODO: expose streaming chunk count/size options.
   iree_status_t loop_status = iree_ok_status();
   iree_hal_file_transfer_options_t options = {
@@ -1706,9 +1706,8 @@
     iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
-    iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers,
-    iree_hal_buffer_binding_table_t const* binding_tables) {
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_buffer_binding_table_t binding_table) {
   iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
 
   // NOTE: today we are not discriminating queues based on command type.
@@ -1720,23 +1719,10 @@
   // buffers on demand here. When we natively support them we'll still need to
   // process the binding table prior to submission but that can be done in a
   // much more lightweight way depending on our concurrency needs.
-  if (IREE_UNLIKELY(command_buffer_count > 32)) {
-    // Guard the stack allocation, yuck.
-    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
-                            "currently limited to a reasonable number of "
-                            "command buffers per submission");
-  }
-  iree_hal_command_buffer_t** translated_command_buffers =
-      (iree_hal_command_buffer_t**)iree_alloca(
-          sizeof(iree_hal_command_buffer_t*) * command_buffer_count);
+  iree_hal_command_buffer_t* translated_command_buffer = NULL;
   iree_status_t status = iree_ok_status();
-  for (iree_host_size_t i = 0; i < command_buffer_count; ++i) {
-    iree_hal_command_buffer_t* command_buffer = command_buffers[i];
-    if (iree_hal_deferred_command_buffer_isa(command_buffers[i])) {
-      iree_hal_command_buffer_t* translated_command_buffer = NULL;
-      iree_hal_buffer_binding_table_t binding_table =
-          binding_tables ? binding_tables[i]
-                         : iree_hal_buffer_binding_table_empty();
+  if (command_buffer != NULL) {
+    if (iree_hal_deferred_command_buffer_isa(command_buffer)) {
       status = iree_hal_vulkan_device_create_command_buffer(
           base_device,
           iree_hal_command_buffer_mode(command_buffer) |
@@ -1752,9 +1738,8 @@
         status = iree_hal_deferred_command_buffer_apply(
             command_buffer, translated_command_buffer, binding_table);
       }
-      translated_command_buffers[i] = translated_command_buffer;
     } else {
-      translated_command_buffers[i] = command_buffer;
+      translated_command_buffer = command_buffer;
       iree_hal_command_buffer_retain(command_buffer);
     }
   }
@@ -1762,8 +1747,10 @@
   if (iree_status_is_ok(status)) {
     iree_hal_vulkan_submission_batch_t batch = {
         /*.wait_semaphores=*/wait_semaphore_list,
-        /*.command_buffer_count=*/command_buffer_count,
-        /*.command_buffers=*/translated_command_buffers,
+        /*.command_buffer_count=*/
+        (iree_host_size_t)(translated_command_buffer ? 1 : 0),
+        /*.command_buffers=*/
+        translated_command_buffer ? &translated_command_buffer : NULL,
         /*.signal_semaphores=*/signal_semaphore_list,
     };
     status = queue->Submit(1, &batch);
@@ -1777,9 +1764,7 @@
 
   // TODO(indirect-cmd): when async these need to be retained until the
   // submission completes.
-  for (iree_host_size_t i = 0; i < command_buffer_count; ++i) {
-    iree_hal_command_buffer_release(translated_command_buffers[i]);
-  }
+  iree_hal_command_buffer_release(translated_command_buffer);
 
   return status;
 }
@@ -1907,6 +1892,9 @@
     iree_hal_vulkan_device_query_semaphore_compatibility,
     /*.queue_alloca=*/iree_hal_vulkan_device_queue_alloca,
     /*.queue_dealloca=*/iree_hal_vulkan_device_queue_dealloca,
+    /*.queue_fill=*/iree_hal_device_queue_emulated_fill,
+    /*.queue_update=*/iree_hal_device_queue_emulated_update,
+    /*.queue_copy=*/iree_hal_device_queue_emulated_copy,
     /*.queue_read=*/iree_hal_vulkan_device_queue_read,
     /*.queue_write=*/iree_hal_vulkan_device_queue_write,
     /*.queue_execute=*/iree_hal_vulkan_device_queue_execute,
diff --git a/runtime/src/iree/hal/local/inline_command_buffer.c b/runtime/src/iree/hal/local/inline_command_buffer.c
index 7ea8513..a28417f 100644
--- a/runtime/src/iree/hal/local/inline_command_buffer.c
+++ b/runtime/src/iree/hal/local/inline_command_buffer.c
@@ -285,12 +285,13 @@
 }
 
 //===----------------------------------------------------------------------===//
-// iree_hal_command_buffer_discard_buffer
+// iree_hal_command_buffer_advise_buffer
 //===----------------------------------------------------------------------===//
 
-static iree_status_t iree_hal_inline_command_buffer_discard_buffer(
+static iree_status_t iree_hal_inline_command_buffer_advise_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t buffer_ref) {
+    iree_hal_buffer_ref_t buffer_ref, iree_hal_memory_advise_flags_t flags,
+    uint64_t arg0, uint64_t arg1) {
   // Could be treated as a cache invalidation as it indicates we won't be using
   // the existing buffer contents again.
   return iree_ok_status();
@@ -303,7 +304,7 @@
 static iree_status_t iree_hal_inline_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_buffer_ref_t target_ref, const void* pattern,
-    iree_host_size_t pattern_length) {
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
   return iree_hal_buffer_map_fill(target_ref.buffer, target_ref.offset,
                                   target_ref.length, pattern, pattern_length);
 }
@@ -314,7 +315,8 @@
 
 static iree_status_t iree_hal_inline_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref,
+    iree_hal_update_flags_t flags) {
   return iree_hal_buffer_map_write(
       target_ref.buffer, target_ref.offset,
       (const uint8_t*)source_buffer + source_offset, target_ref.length);
@@ -326,7 +328,8 @@
 
 static iree_status_t iree_hal_inline_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref,
+    iree_hal_copy_flags_t flags) {
   return iree_hal_buffer_map_copy(source_ref.buffer, source_ref.offset,
                                   target_ref.buffer, target_ref.offset,
                                   target_ref.length);
@@ -503,7 +506,7 @@
         .signal_event = iree_hal_inline_command_buffer_signal_event,
         .reset_event = iree_hal_inline_command_buffer_reset_event,
         .wait_events = iree_hal_inline_command_buffer_wait_events,
-        .discard_buffer = iree_hal_inline_command_buffer_discard_buffer,
+        .advise_buffer = iree_hal_inline_command_buffer_advise_buffer,
         .fill_buffer = iree_hal_inline_command_buffer_fill_buffer,
         .update_buffer = iree_hal_inline_command_buffer_update_buffer,
         .copy_buffer = iree_hal_inline_command_buffer_copy_buffer,
diff --git a/runtime/src/iree/hal/utils/debug_allocator.c b/runtime/src/iree/hal/utils/debug_allocator.c
index b0be6d8..6389d8b 100644
--- a/runtime/src/iree/hal/utils/debug_allocator.c
+++ b/runtime/src/iree/hal/utils/debug_allocator.c
@@ -170,7 +170,7 @@
     };
     status = iree_hal_device_queue_execute(
         device, IREE_HAL_QUEUE_AFFINITY_ANY, iree_hal_semaphore_list_empty(),
-        signal_list, 1, &command_buffer, /*binding_tables=*/NULL);
+        signal_list, command_buffer, iree_hal_buffer_binding_table_empty());
   }
 
   if (iree_status_is_ok(status)) {
diff --git a/runtime/src/iree/hal/utils/deferred_command_buffer.c b/runtime/src/iree/hal/utils/deferred_command_buffer.c
index 7206254..939cddc 100644
--- a/runtime/src/iree/hal/utils/deferred_command_buffer.c
+++ b/runtime/src/iree/hal/utils/deferred_command_buffer.c
@@ -18,7 +18,7 @@
   IREE_HAL_CMD_SIGNAL_EVENT,
   IREE_HAL_CMD_RESET_EVENT,
   IREE_HAL_CMD_WAIT_EVENTS,
-  IREE_HAL_CMD_DISCARD_BUFFER,
+  IREE_HAL_CMD_ADVISE_BUFFER,
   IREE_HAL_CMD_FILL_BUFFER,
   IREE_HAL_CMD_UPDATE_BUFFER,
   IREE_HAL_CMD_COPY_BUFFER,
@@ -433,17 +433,21 @@
 }
 
 //===----------------------------------------------------------------------===//
-// IREE_HAL_CMD_DISCARD_BUFFER
+// IREE_HAL_CMD_ADVISE_BUFFER
 //===----------------------------------------------------------------------===//
 
-typedef struct iree_hal_cmd_discard_buffer_t {
+typedef struct iree_hal_cmd_advise_buffer_t {
   iree_hal_cmd_header_t header;
   iree_hal_buffer_ref_t buffer_ref;
-} iree_hal_cmd_discard_buffer_t;
+  iree_hal_memory_advise_flags_t flags;
+  uint64_t arg0;
+  uint64_t arg1;
+} iree_hal_cmd_advise_buffer_t;
 
-static iree_status_t iree_hal_deferred_command_buffer_discard_buffer(
+static iree_status_t iree_hal_deferred_command_buffer_advise_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t buffer_ref) {
+    iree_hal_buffer_ref_t buffer_ref, iree_hal_memory_advise_flags_t flags,
+    uint64_t arg0, uint64_t arg1) {
   iree_hal_deferred_command_buffer_t* command_buffer =
       iree_hal_deferred_command_buffer_cast(base_command_buffer);
   iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
@@ -451,22 +455,25 @@
     IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
         command_buffer->resource_set, 1, &buffer_ref.buffer));
   }
-  iree_hal_cmd_discard_buffer_t* cmd = NULL;
+  iree_hal_cmd_advise_buffer_t* cmd = NULL;
   IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
-      cmd_list, IREE_HAL_CMD_DISCARD_BUFFER, sizeof(*cmd), (void**)&cmd));
+      cmd_list, IREE_HAL_CMD_ADVISE_BUFFER, sizeof(*cmd), (void**)&cmd));
   cmd->buffer_ref = buffer_ref;
+  cmd->flags = flags;
+  cmd->arg0 = arg0;
+  cmd->arg1 = arg1;
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_deferred_command_buffer_apply_discard_buffer(
+static iree_status_t iree_hal_deferred_command_buffer_apply_advise_buffer(
     iree_hal_command_buffer_t* target_command_buffer,
     iree_hal_buffer_binding_table_t binding_table,
-    const iree_hal_cmd_discard_buffer_t* cmd) {
+    const iree_hal_cmd_advise_buffer_t* cmd) {
   iree_hal_buffer_ref_t buffer_ref;
   IREE_RETURN_IF_ERROR(iree_hal_buffer_binding_table_resolve_ref(
       binding_table, cmd->buffer_ref, &buffer_ref));
-  return iree_hal_command_buffer_discard_buffer(target_command_buffer,
-                                                buffer_ref);
+  return iree_hal_command_buffer_advise_buffer(
+      target_command_buffer, buffer_ref, cmd->flags, cmd->arg0, cmd->arg1);
 }
 
 //===----------------------------------------------------------------------===//
@@ -478,12 +485,13 @@
   iree_hal_buffer_ref_t target_ref;
   uint64_t pattern;
   iree_host_size_t pattern_length;
+  iree_hal_fill_flags_t flags;
 } iree_hal_cmd_fill_buffer_t;
 
 static iree_status_t iree_hal_deferred_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_buffer_ref_t target_ref, const void* pattern,
-    iree_host_size_t pattern_length) {
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
   iree_hal_deferred_command_buffer_t* command_buffer =
       iree_hal_deferred_command_buffer_cast(base_command_buffer);
   iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
@@ -501,6 +509,7 @@
   cmd->target_ref = target_ref;
   memcpy(&cmd->pattern, pattern, pattern_length);
   cmd->pattern_length = pattern_length;
+  cmd->flags = flags;
   return iree_ok_status();
 }
 
@@ -513,7 +522,7 @@
       binding_table, cmd->target_ref, &target_ref));
   return iree_hal_command_buffer_fill_buffer(target_command_buffer, target_ref,
                                              (void**)&cmd->pattern,
-                                             cmd->pattern_length);
+                                             cmd->pattern_length, cmd->flags);
 }
 
 //===----------------------------------------------------------------------===//
@@ -523,12 +532,14 @@
 typedef struct iree_hal_cmd_update_buffer_t {
   iree_hal_cmd_header_t header;
   iree_hal_buffer_ref_t target_ref;
+  iree_hal_update_flags_t flags;
   uint8_t source_buffer[];
 } iree_hal_cmd_update_buffer_t;
 
 static iree_status_t iree_hal_deferred_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref,
+    iree_hal_update_flags_t flags) {
   iree_hal_deferred_command_buffer_t* command_buffer =
       iree_hal_deferred_command_buffer_cast(base_command_buffer);
   iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
@@ -542,6 +553,7 @@
       sizeof(*cmd) + sizeof(cmd->source_buffer[0]) * target_ref.length,
       (void**)&cmd));
   cmd->target_ref = target_ref;
+  cmd->flags = flags;
   memcpy(cmd->source_buffer, (const uint8_t*)source_buffer + source_offset,
          sizeof(cmd->source_buffer[0]) * target_ref.length);
   return iree_ok_status();
@@ -555,7 +567,7 @@
   IREE_RETURN_IF_ERROR(iree_hal_buffer_binding_table_resolve_ref(
       binding_table, cmd->target_ref, &target_ref));
   return iree_hal_command_buffer_update_buffer(
-      target_command_buffer, cmd->source_buffer, 0, target_ref);
+      target_command_buffer, cmd->source_buffer, 0, target_ref, cmd->flags);
 }
 
 //===----------------------------------------------------------------------===//
@@ -566,11 +578,13 @@
   iree_hal_cmd_header_t header;
   iree_hal_buffer_ref_t source_ref;
   iree_hal_buffer_ref_t target_ref;
+  iree_hal_copy_flags_t flags;
 } iree_hal_cmd_copy_buffer_t;
 
 static iree_status_t iree_hal_deferred_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref,
+    iree_hal_copy_flags_t flags) {
   iree_hal_deferred_command_buffer_t* command_buffer =
       iree_hal_deferred_command_buffer_cast(base_command_buffer);
   iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
@@ -591,6 +605,7 @@
       cmd_list, IREE_HAL_CMD_COPY_BUFFER, sizeof(*cmd), (void**)&cmd));
   cmd->source_ref = source_ref;
   cmd->target_ref = target_ref;
+  cmd->flags = flags;
   return iree_ok_status();
 }
 
@@ -605,7 +620,7 @@
   IREE_RETURN_IF_ERROR(iree_hal_buffer_binding_table_resolve_ref(
       binding_table, cmd->target_ref, &target_ref));
   return iree_hal_command_buffer_copy_buffer(target_command_buffer, source_ref,
-                                             target_ref);
+                                             target_ref, cmd->flags);
 }
 
 //===----------------------------------------------------------------------===//
@@ -832,8 +847,8 @@
         iree_hal_deferred_command_buffer_apply_reset_event,
     [IREE_HAL_CMD_WAIT_EVENTS] = (iree_hal_cmd_apply_fn_t)
         iree_hal_deferred_command_buffer_apply_wait_events,
-    [IREE_HAL_CMD_DISCARD_BUFFER] = (iree_hal_cmd_apply_fn_t)
-        iree_hal_deferred_command_buffer_apply_discard_buffer,
+    [IREE_HAL_CMD_ADVISE_BUFFER] = (iree_hal_cmd_apply_fn_t)
+        iree_hal_deferred_command_buffer_apply_advise_buffer,
     [IREE_HAL_CMD_FILL_BUFFER] = (iree_hal_cmd_apply_fn_t)
         iree_hal_deferred_command_buffer_apply_fill_buffer,
     [IREE_HAL_CMD_UPDATE_BUFFER] = (iree_hal_cmd_apply_fn_t)
@@ -894,7 +909,7 @@
         .signal_event = iree_hal_deferred_command_buffer_signal_event,
         .reset_event = iree_hal_deferred_command_buffer_reset_event,
         .wait_events = iree_hal_deferred_command_buffer_wait_events,
-        .discard_buffer = iree_hal_deferred_command_buffer_discard_buffer,
+        .advise_buffer = iree_hal_deferred_command_buffer_advise_buffer,
         .fill_buffer = iree_hal_deferred_command_buffer_fill_buffer,
         .update_buffer = iree_hal_deferred_command_buffer_update_buffer,
         .copy_buffer = iree_hal_deferred_command_buffer_copy_buffer,
diff --git a/runtime/src/iree/hal/utils/file_transfer.c b/runtime/src/iree/hal/utils/file_transfer.c
index 2bc8dec..193a2f5 100644
--- a/runtime/src/iree/hal/utils/file_transfer.c
+++ b/runtime/src/iree/hal/utils/file_transfer.c
@@ -524,7 +524,8 @@
         operation->device, operation->queue_affinity, wait_semaphore_list,
         signal_semaphore_list, operation->staging_buffer,
         worker->staging_buffer_offset, operation->buffer,
-        operation->buffer_offset + transfer_offset, transfer_length);
+        operation->buffer_offset + transfer_offset, transfer_length,
+        IREE_HAL_COPY_FLAG_NONE);
   }
 
   // Wait for the copy to complete and tick again if we expect there to be more
@@ -688,7 +689,7 @@
       operation->device, operation->queue_affinity, wait_semaphore_list,
       signal_semaphore_list, operation->buffer,
       operation->buffer_offset + transfer_offset, operation->staging_buffer,
-      worker->staging_buffer_offset, transfer_length);
+      worker->staging_buffer_offset, transfer_length, IREE_HAL_COPY_FLAG_NONE);
 
   // Wait for the copy to complete so we can write it to the file.
   if (iree_status_is_ok(status)) {
@@ -860,7 +861,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_file_t* source_file, uint64_t source_offset,
     iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, uint32_t flags,
+    iree_device_size_t length, iree_hal_read_flags_t flags,
     iree_hal_file_transfer_options_t options) {
   IREE_RETURN_IF_ERROR(
       iree_hal_file_validate_access(source_file, IREE_HAL_MEMORY_ACCESS_READ));
@@ -872,7 +873,7 @@
     return iree_hal_device_queue_copy(
         device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
         storage_buffer, (iree_device_size_t)source_offset, target_buffer,
-        target_offset, length);
+        target_offset, length, IREE_HAL_COPY_FLAG_NONE);
   }
 
   // Allocate full transfer operation.
@@ -900,7 +901,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
     iree_hal_file_t* target_file, uint64_t target_offset,
-    iree_device_size_t length, uint32_t flags,
+    iree_device_size_t length, iree_hal_write_flags_t flags,
     iree_hal_file_transfer_options_t options) {
   // EXPERIMENTAL: assume memory files only today (as that's all we have).
   IREE_RETURN_IF_ERROR(
@@ -913,7 +914,7 @@
     return iree_hal_device_queue_copy(
         device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
         source_buffer, source_offset, storage_buffer,
-        (iree_device_size_t)target_offset, length);
+        (iree_device_size_t)target_offset, length, IREE_HAL_COPY_FLAG_NONE);
   }
 
   // Allocate full transfer operation.
diff --git a/runtime/src/iree/hal/utils/file_transfer.h b/runtime/src/iree/hal/utils/file_transfer.h
index cf85099..ece694b 100644
--- a/runtime/src/iree/hal/utils/file_transfer.h
+++ b/runtime/src/iree/hal/utils/file_transfer.h
@@ -60,7 +60,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_file_t* source_file, uint64_t source_offset,
     iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, uint32_t flags,
+    iree_device_size_t length, iree_hal_read_flags_t flags,
     iree_hal_file_transfer_options_t options);
 
 // EXPERIMENTAL: eventually we'll focus this only on emulating support where
@@ -83,7 +83,7 @@
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
     iree_hal_file_t* target_file, uint64_t target_offset,
-    iree_device_size_t length, uint32_t flags,
+    iree_device_size_t length, iree_hal_write_flags_t flags,
     iree_hal_file_transfer_options_t options);
 
 #ifdef __cplusplus
diff --git a/runtime/src/iree/io/parameter_index_provider.c b/runtime/src/iree/io/parameter_index_provider.c
index 75ed9c5..9e9a5f4 100644
--- a/runtime/src/iree/io/parameter_index_provider.c
+++ b/runtime/src/iree/io/parameter_index_provider.c
@@ -512,7 +512,7 @@
       z0, iree_hal_command_buffer_fill_buffer(
               batch->transfer_command_buffer,
               iree_hal_make_buffer_ref(buffer, buffer_offset, length), pattern,
-              pattern_length));
+              pattern_length, IREE_HAL_FILL_FLAG_NONE));
 
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
@@ -523,7 +523,7 @@
     iree_io_parameter_op_batch_t* batch, iree_hal_file_t* source_file,
     uint64_t source_file_offset, iree_hal_buffer_t* target_buffer,
     iree_device_size_t target_buffer_offset, iree_device_size_t length,
-    uint32_t flags) {
+    iree_hal_read_flags_t flags) {
   IREE_ASSERT_ARGUMENT(batch);
   IREE_ASSERT_ARGUMENT(source_file);
   IREE_ASSERT_ARGUMENT(target_buffer);
@@ -546,7 +546,8 @@
 static iree_status_t iree_io_parameter_op_batch_enqueue_file_write(
     iree_io_parameter_op_batch_t* batch, iree_hal_buffer_t* source_buffer,
     iree_device_size_t source_buffer_offset, iree_hal_file_t* target_file,
-    uint64_t target_file_offset, iree_device_size_t length, uint32_t flags) {
+    uint64_t target_file_offset, iree_device_size_t length,
+    iree_hal_write_flags_t flags) {
   IREE_ASSERT_ARGUMENT(batch);
   IREE_ASSERT_ARGUMENT(source_buffer);
   IREE_ASSERT_ARGUMENT(target_file);
@@ -591,8 +592,8 @@
     if (iree_status_is_ok(status)) {
       status = iree_hal_device_queue_execute(
           batch->device, batch->queue_affinity, step.wait_semaphore_list,
-          step.signal_semaphore_list, 1, &batch->transfer_command_buffer,
-          /*binding_tables=*/NULL);
+          step.signal_semaphore_list, batch->transfer_command_buffer,
+          iree_hal_buffer_binding_table_empty());
     }
     IREE_TRACE_ZONE_END(z_transfer);
   }
diff --git a/runtime/src/iree/modules/check/module.cc b/runtime/src/iree/modules/check/module.cc
index 0dfe9e1..9ffd6cc 100644
--- a/runtime/src/iree/modules/check/module.cc
+++ b/runtime/src/iree/modules/check/module.cc
@@ -207,7 +207,8 @@
     IREE_RETURN_IF_ERROR(iree_hal_command_buffer_copy_buffer(
         command_buffer.get(),
         iree_hal_make_buffer_ref(source_buffer, 0, buffer_length),
-        iree_hal_make_buffer_ref(target_buffer.get(), 0, buffer_length)));
+        iree_hal_make_buffer_ref(target_buffer.get(), 0, buffer_length),
+        IREE_HAL_COPY_FLAG_NONE));
     vm::ref<iree_hal_buffer_view_t> target_view;
     IREE_RETURN_IF_ERROR(iree_hal_buffer_view_create_like(
         target_buffer.get(), source_views[i].get(),
@@ -224,8 +225,8 @@
       semaphore.get(), 1ull, iree_hal_device_host_allocator(device), &fence));
   IREE_RETURN_IF_ERROR(iree_hal_device_queue_execute(
       device, IREE_HAL_QUEUE_AFFINITY_ANY, iree_hal_semaphore_list_empty(),
-      iree_hal_fence_semaphore_list(fence.get()), 1, &command_buffer,
-      /*binding_tables=*/NULL));
+      iree_hal_fence_semaphore_list(fence.get()), command_buffer.get(),
+      iree_hal_buffer_binding_table_empty()));
   IREE_RETURN_IF_ERROR(
       iree_hal_fence_wait(fence.get(), iree_infinite_timeout()));
   return std::move(target_views);
diff --git a/runtime/src/iree/modules/hal/module.c b/runtime/src/iree/modules/hal/module.c
index 38a95c6..2aac43f 100644
--- a/runtime/src/iree/modules/hal/module.c
+++ b/runtime/src/iree/modules/hal/module.c
@@ -834,9 +834,9 @@
       iree_hal_buffer_check_deref_or_null(args->r1, &target_ref.buffer));
   uint32_t pattern = (uint32_t)args->i5;
   uint32_t pattern_length = (uint32_t)args->i6;
-
+  iree_hal_fill_flags_t flags = IREE_HAL_FILL_FLAG_NONE;
   return iree_hal_command_buffer_fill_buffer(command_buffer, target_ref,
-                                             &pattern, pattern_length);
+                                             &pattern, pattern_length, flags);
 }
 
 IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_update_buffer,  //
@@ -855,13 +855,13 @@
       target_buffer_slot, target_offset, length);
   IREE_RETURN_IF_ERROR(
       iree_hal_buffer_check_deref_or_null(args->r3, &target_ref.buffer));
-
   iree_const_byte_span_t source_span = iree_const_byte_span_empty();
   IREE_RETURN_IF_ERROR(iree_vm_buffer_map_ro(
       source_buffer, source_offset, (iree_host_size_t)length, 1, &source_span));
-
+  iree_hal_update_flags_t flags = IREE_HAL_UPDATE_FLAG_NONE;
   return iree_hal_command_buffer_update_buffer(command_buffer, source_span.data,
-                                               /*source_offset=*/0, target_ref);
+                                               /*source_offset=*/0, target_ref,
+                                               flags);
 }
 
 IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_copy_buffer,  //
@@ -883,9 +883,9 @@
       iree_hal_buffer_check_deref_or_null(args->r3, &source_ref.buffer));
   IREE_RETURN_IF_ERROR(
       iree_hal_buffer_check_deref_or_null(args->r5, &target_ref.buffer));
-
+  iree_hal_copy_flags_t flags = IREE_HAL_COPY_FLAG_NONE;
   return iree_hal_command_buffer_copy_buffer(command_buffer, source_ref,
-                                             target_ref);
+                                             target_ref, flags);
 }
 
 IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_collective,  //
@@ -911,7 +911,6 @@
   IREE_RETURN_IF_ERROR(
       iree_hal_buffer_check_deref_or_null(args->r7, &recv_ref.buffer));
   iree_device_size_t element_count = iree_hal_cast_device_size(args->i12);
-
   return iree_hal_command_buffer_collective(command_buffer, channel, op, param,
                                             send_ref, recv_ref, element_count);
 }
@@ -1215,7 +1214,7 @@
   IREE_RETURN_IF_ERROR(iree_hal_buffer_check_deref(args->r6, &target_buffer));
   iree_device_size_t target_offset = iree_hal_cast_device_size(args->i7);
   iree_device_size_t length = iree_hal_cast_device_size(args->i8);
-  uint32_t flags = (uint32_t)args->i9;
+  iree_hal_read_flags_t flags = (iree_hal_read_flags_t)args->i9;
   return iree_hal_device_queue_read(
       device, queue_affinity, iree_hal_fence_semaphore_list(wait_fence),
       iree_hal_fence_semaphore_list(signal_fence), source_file, source_offset,
@@ -1238,7 +1237,7 @@
   IREE_RETURN_IF_ERROR(iree_hal_file_check_deref(args->r6, &target_file));
   uint64_t target_offset = (uint64_t)args->i7;
   iree_device_size_t length = iree_hal_cast_device_size(args->i8);
-  uint32_t flags = (uint32_t)args->i9;
+  iree_hal_write_flags_t flags = (iree_hal_write_flags_t)args->i9;
   return iree_hal_device_queue_write(
       device, queue_affinity, iree_hal_fence_semaphore_list(wait_fence),
       iree_hal_fence_semaphore_list(signal_fence), source_buffer, source_offset,
@@ -1258,10 +1257,15 @@
   iree_hal_command_buffer_t** command_buffers = NULL;
   IREE_VM_ABI_VLA_STACK_DEREF(args, a4_count, a4, iree_hal_command_buffer, 32,
                               &command_buffer_count, &command_buffers);
+  if (command_buffer_count > 1) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "only zero or one command buffer is allowed");
+  }
   return iree_hal_device_queue_execute(
       device, queue_affinity, iree_hal_fence_semaphore_list(wait_fence),
-      iree_hal_fence_semaphore_list(signal_fence), command_buffer_count,
-      command_buffers, /*binding_tables=*/NULL);
+      iree_hal_fence_semaphore_list(signal_fence),
+      command_buffer_count > 0 ? command_buffers[0] : NULL,
+      iree_hal_buffer_binding_table_empty());
 }
 
 IREE_VM_ABI_EXPORT(iree_hal_module_device_queue_execute_indirect,  //
@@ -1313,8 +1317,8 @@
     };
     status = iree_hal_device_queue_execute(
         device, queue_affinity, iree_hal_fence_semaphore_list(wait_fence),
-        iree_hal_fence_semaphore_list(signal_fence), 1, &command_buffer,
-        &binding_table);
+        iree_hal_fence_semaphore_list(signal_fence), command_buffer,
+        binding_table);
   }
 
   // If we had to heap-allocate the binding table storage it must be freed
diff --git a/runtime/src/iree/tooling/function_util.c b/runtime/src/iree/tooling/function_util.c
index a21b6c5..0e2d38d 100644
--- a/runtime/src/iree/tooling/function_util.c
+++ b/runtime/src/iree/tooling/function_util.c
@@ -87,7 +87,8 @@
       iree_hal_make_buffer_ref(source_buffer, 0,
                                iree_hal_buffer_byte_length(source_buffer)),
       iree_hal_make_buffer_ref(target_buffer, 0,
-                               iree_hal_buffer_byte_length(source_buffer)));
+                               iree_hal_buffer_byte_length(source_buffer)),
+      IREE_HAL_COPY_FLAG_NONE);
 
   if (iree_status_is_ok(status)) {
     *out_target_buffer = target_buffer;
@@ -122,8 +123,8 @@
   if (iree_status_is_ok(status)) {
     status = iree_hal_device_queue_execute(
         device, queue_affinity, iree_hal_fence_semaphore_list(wait_fence),
-        iree_hal_fence_semaphore_list(signal_fence), 1, &command_buffer,
-        /*binding_tables=*/NULL);
+        iree_hal_fence_semaphore_list(signal_fence), command_buffer,
+        iree_hal_buffer_binding_table_empty());
   }
 
   if (iree_status_is_ok(status) && needs_wait) {
diff --git a/tools/iree-benchmark-executable-main.c b/tools/iree-benchmark-executable-main.c
index f5cfb4a..d769959 100644
--- a/tools/iree-benchmark-executable-main.c
+++ b/tools/iree-benchmark-executable-main.c
@@ -255,7 +255,8 @@
     ++fence_value;
     IREE_RETURN_IF_ERROR(iree_hal_device_queue_execute(
         args->device, IREE_HAL_QUEUE_AFFINITY_ANY, wait_semaphore_list,
-        signal_semaphore_list, 1, &command_buffer, /*binding_tables=*/NULL));
+        signal_semaphore_list, command_buffer,
+        iree_hal_buffer_binding_table_empty()));
 
     // Block and wait for the submission to complete.
     // Note that this will include round-trip overhead and if the dispatch or