Retaining binding tables and plumbing indirect cmds in local-task. (#17838)

The task system doesn't currently support resubmitting task graphs
(scheduling happens in-place in the task graph structure), so reusable
command buffers are emulated with a deferred command buffer, as on
other targets. Async targets need to retain binding table contents
until the submission completes (the resource set in the command buffer
won't hold them), and a helper was added so this can be done
efficiently in the future (today it's still slow).
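
A minimal sketch of the retention pattern (not part of this change; the
helper name `retain_binding_table_buffers` is illustrative), following
the strided resource-set insert added to task_queue.c below:

```c
#include <stddef.h>

#include "iree/hal/api.h"
#include "iree/hal/utils/resource_set.h"

// Sketch only: retain every buffer referenced by a submission's binding
// tables so they remain live until the queue operation retires. Mirrors
// the iree_hal_resource_set_insert_strided usage in the local-task queue.
static iree_status_t retain_binding_table_buffers(
    iree_hal_resource_set_t* resource_set, iree_host_size_t table_count,
    const iree_hal_buffer_binding_table_t* binding_tables) {
  for (iree_host_size_t i = 0; i < table_count; ++i) {
    // Bulk-insert the |buffer| field of each binding element; the stride
    // lets the set walk the packed binding array without a copy.
    IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert_strided(
        resource_set, binding_tables[i].count, binding_tables[i].bindings,
        offsetof(iree_hal_buffer_binding_t, buffer),
        sizeof(iree_hal_buffer_binding_t)));
  }
  return iree_ok_status();
}
```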
diff --git a/experimental/rocm/direct_command_buffer.c b/experimental/rocm/direct_command_buffer.c
index 80fade6..fe165e5 100644
--- a/experimental/rocm/direct_command_buffer.c
+++ b/experimental/rocm/direct_command_buffer.c
@@ -52,7 +52,8 @@
 }
 
 iree_status_t iree_hal_rocm_direct_command_buffer_create(
-    iree_hal_device_t* device, iree_hal_rocm_context_wrapper_t* context,
+    iree_hal_allocator_t* device_allocator,
+    iree_hal_rocm_context_wrapper_t* context,
     iree_hal_rocm_tracing_context_t* tracing_context,
     iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
@@ -82,8 +83,8 @@
       (void**)&command_buffer);
   if (iree_status_is_ok(status)) {
     iree_hal_command_buffer_initialize(
-        device, mode, command_categories, queue_affinity, binding_capacity,
-        (uint8_t*)command_buffer + total_size,
+        device_allocator, mode, command_categories, queue_affinity,
+        binding_capacity, (uint8_t*)command_buffer + total_size,
         &iree_hal_rocm_direct_command_buffer_vtable, &command_buffer->base);
     command_buffer->context = context;
     command_buffer->tracing_context = tracing_context;
diff --git a/experimental/rocm/direct_command_buffer.h b/experimental/rocm/direct_command_buffer.h
index 45617a1..9d6f5ae 100644
--- a/experimental/rocm/direct_command_buffer.h
+++ b/experimental/rocm/direct_command_buffer.h
@@ -34,7 +34,8 @@
 
 // Creates a rocm direct command buffer.
 iree_status_t iree_hal_rocm_direct_command_buffer_create(
-    iree_hal_device_t* device, iree_hal_rocm_context_wrapper_t* context,
+    iree_hal_allocator_t* device_allocator,
+    iree_hal_rocm_context_wrapper_t* context,
     iree_hal_rocm_tracing_context_t* tracing_context,
     iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
diff --git a/experimental/rocm/rocm_device.c b/experimental/rocm/rocm_device.c
index fabc2f7..697ab1f 100644
--- a/experimental/rocm/rocm_device.c
+++ b/experimental/rocm/rocm_device.c
@@ -238,9 +238,9 @@
     iree_hal_command_buffer_t** out_command_buffer) {
   iree_hal_rocm_device_t* device = iree_hal_rocm_device_cast(base_device);
   return iree_hal_rocm_direct_command_buffer_create(
-      base_device, &device->context_wrapper, device->tracing_context, mode,
-      command_categories, queue_affinity, binding_capacity, &device->block_pool,
-      out_command_buffer);
+      iree_hal_device_allocator(base_device), &device->context_wrapper,
+      device->tracing_context, mode, command_categories, queue_affinity,
+      binding_capacity, &device->block_pool, out_command_buffer);
 }
 
 static iree_status_t iree_hal_rocm_device_create_descriptor_set_layout(
diff --git a/experimental/webgpu/command_buffer.c b/experimental/webgpu/command_buffer.c
index 04ad6ee..d57dee4 100644
--- a/experimental/webgpu/command_buffer.c
+++ b/experimental/webgpu/command_buffer.c
@@ -180,7 +180,7 @@
 }
 
 iree_status_t iree_hal_webgpu_command_buffer_create(
-    iree_hal_device_t* device, WGPUDevice device_handle,
+    iree_hal_allocator_t* device_allocator, WGPUDevice device_handle,
     iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
@@ -189,7 +189,7 @@
     iree_hal_webgpu_bind_group_cache_t* bind_group_cache,
     iree_hal_webgpu_builtins_t* builtins, iree_allocator_t host_allocator,
     iree_hal_command_buffer_t** out_command_buffer) {
-  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(device_allocator);
   IREE_ASSERT_ARGUMENT(block_pool);
   IREE_ASSERT_ARGUMENT(staging_buffer);
   IREE_ASSERT_ARGUMENT(bind_group_cache);
@@ -213,8 +213,8 @@
       (void**)&command_buffer);
   if (iree_status_is_ok(status)) {
     iree_hal_command_buffer_initialize(
-        device, mode, command_categories, queue_affinity, binding_capacity,
-        (uint8_t*)command_buffer + sizeof(*command_buffer),
+        device_allocator, mode, command_categories, queue_affinity,
+        binding_capacity, (uint8_t*)command_buffer + sizeof(*command_buffer),
         &iree_hal_webgpu_command_buffer_vtable, &command_buffer->base);
     command_buffer->host_allocator = host_allocator;
     command_buffer->device = device_handle;
diff --git a/experimental/webgpu/command_buffer.h b/experimental/webgpu/command_buffer.h
index 2cc5780..efcc186 100644
--- a/experimental/webgpu/command_buffer.h
+++ b/experimental/webgpu/command_buffer.h
@@ -20,7 +20,7 @@
 #endif  // __cplusplus
 
 iree_status_t iree_hal_webgpu_command_buffer_create(
-    iree_hal_device_t* device, WGPUDevice device_handle,
+    iree_hal_allocator_t* device_allocator, WGPUDevice device_handle,
     iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
diff --git a/experimental/webgpu/webgpu_device.c b/experimental/webgpu/webgpu_device.c
index e3246e0..70cf439 100644
--- a/experimental/webgpu/webgpu_device.c
+++ b/experimental/webgpu/webgpu_device.c
@@ -243,10 +243,11 @@
     iree_hal_command_buffer_t** out_command_buffer) {
   iree_hal_webgpu_device_t* device = iree_hal_webgpu_device_cast(base_device);
   return iree_hal_webgpu_command_buffer_create(
-      (iree_hal_device_t*)device, device->handle, mode, command_categories,
-      queue_affinity, binding_capacity, &device->large_block_pool,
-      &device->staging_buffer, &device->bind_group_cache, &device->builtins,
-      device->host_allocator, out_command_buffer);
+      iree_hal_device_allocator(base_device), device->handle, mode,
+      command_categories, queue_affinity, binding_capacity,
+      &device->large_block_pool, &device->staging_buffer,
+      &device->bind_group_cache, &device->builtins, device->host_allocator,
+      out_command_buffer);
 }
 
 static iree_status_t iree_hal_webgpu_device_create_descriptor_set_layout(
diff --git a/runtime/src/iree/hal/command_buffer.c b/runtime/src/iree/hal/command_buffer.c
index 7da7174..38619a1 100644
--- a/runtime/src/iree/hal/command_buffer.c
+++ b/runtime/src/iree/hal/command_buffer.c
@@ -183,7 +183,7 @@
 }
 
 IREE_API_EXPORT void iree_hal_command_buffer_initialize(
-    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
     void* validation_state, const iree_hal_command_buffer_vtable_t* vtable,
@@ -213,7 +213,7 @@
   // implementation.
   IF_VALIDATING(command_buffer, {
     iree_hal_command_buffer_initialize_validation(
-        device, command_buffer, VALIDATION_STATE(command_buffer));
+        device_allocator, command_buffer, VALIDATION_STATE(command_buffer));
   });
 }
 
diff --git a/runtime/src/iree/hal/command_buffer.h b/runtime/src/iree/hal/command_buffer.h
index 49c516e..fb849ab 100644
--- a/runtime/src/iree/hal/command_buffer.h
+++ b/runtime/src/iree/hal/command_buffer.h
@@ -904,7 +904,7 @@
     iree_hal_command_buffer_mode_t mode, iree_host_size_t binding_capacity);
 
 IREE_API_EXPORT void iree_hal_command_buffer_initialize(
-    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
     void* validation_state, const iree_hal_command_buffer_vtable_t* vtable,
diff --git a/runtime/src/iree/hal/command_buffer_validation.c b/runtime/src/iree/hal/command_buffer_validation.c
index 6ae820a..9086303 100644
--- a/runtime/src/iree/hal/command_buffer_validation.c
+++ b/runtime/src/iree/hal/command_buffer_validation.c
@@ -14,7 +14,6 @@
 #include "iree/hal/allocator.h"
 #include "iree/hal/buffer.h"
 #include "iree/hal/detail.h"
-#include "iree/hal/device.h"
 #include "iree/hal/event.h"
 #include "iree/hal/executable.h"
 #include "iree/hal/pipeline_layout.h"
@@ -60,7 +59,7 @@
     iree_hal_buffer_usage_t intended_usage) {
   iree_hal_buffer_compatibility_t allowed_compatibility =
       iree_hal_allocator_query_buffer_compatibility(
-          iree_hal_device_allocator(validation_state->device),
+          validation_state->device_allocator,
           (iree_hal_buffer_params_t){
               .type = iree_hal_buffer_memory_type(buffer),
               .usage = iree_hal_buffer_allowed_usage(buffer) & intended_usage,
@@ -208,9 +207,10 @@
 }
 
 void iree_hal_command_buffer_initialize_validation(
-    iree_hal_device_t* device, iree_hal_command_buffer_t* command_buffer,
+    iree_hal_allocator_t* device_allocator,
+    iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* out_validation_state) {
-  out_validation_state->device = device;
+  out_validation_state->device_allocator = device_allocator;
   out_validation_state->is_recording = false;
   out_validation_state->debug_group_depth = 0;
 }
diff --git a/runtime/src/iree/hal/command_buffer_validation.h b/runtime/src/iree/hal/command_buffer_validation.h
index 4a40b49..036d666 100644
--- a/runtime/src/iree/hal/command_buffer_validation.h
+++ b/runtime/src/iree/hal/command_buffer_validation.h
@@ -27,7 +27,9 @@
 // Storage for command buffer validation state.
 // Designed to be embedded in concrete implementations that want validation.
 typedef struct iree_hal_command_buffer_validation_state_t {
-  iree_hal_device_t* device;
+  // Allocator from the device the command buffer is targeting.
+  // Used to verify buffer compatibility.
+  iree_hal_allocator_t* device_allocator;
   // 1 when in a begin/end recording sequence.
   int32_t is_recording : 1;
   // Debug group depth for tracking proper begin/end pairing.
@@ -41,7 +43,8 @@
 } iree_hal_command_buffer_validation_state_t;
 
 void iree_hal_command_buffer_initialize_validation(
-    iree_hal_device_t* device, iree_hal_command_buffer_t* command_buffer,
+    iree_hal_allocator_t* device_allocator,
+    iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* out_validation_state);
 
 iree_status_t iree_hal_command_buffer_begin_validation(
diff --git a/runtime/src/iree/hal/device.h b/runtime/src/iree/hal/device.h
index 13c8263..2f52d54 100644
--- a/runtime/src/iree/hal/device.h
+++ b/runtime/src/iree/hal/device.h
@@ -154,32 +154,6 @@
 };
 typedef uint32_t iree_hal_semaphore_compatibility_t;
 
-// A single batch of command buffers submitted to a device queue.
-// All of the wait semaphores must reach or exceed the given payload value prior
-// to the batch beginning execution. Each command buffer begins execution in the
-// order it is present in the list, though note that the command buffers
-// execute concurrently and require internal synchronization via events if there
-// are any dependencies between them. Only after all command buffers have
-// completed will the signal semaphores be updated to the provided payload
-// values.
-//
-// Matches Vulkan's VkSubmitInfo:
-// https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkSubmitInfo.html
-// Note that as the HAL only models timeline semaphores we take the payload
-// values directly in this struct; see:
-// https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkTimelineSemaphoreSubmitInfo.html
-typedef struct iree_hal_submission_batch_t {
-  // Semaphores to wait on prior to executing any command buffer.
-  iree_hal_semaphore_list_t wait_semaphores;
-
-  // Command buffers to execute, in order.
-  iree_host_size_t command_buffer_count;
-  iree_hal_command_buffer_t* const* command_buffers;
-
-  // Semaphores to signal once all command buffers have completed execution.
-  iree_hal_semaphore_list_t signal_semaphores;
-} iree_hal_submission_batch_t;
-
 // Defines how a multi-wait operation treats the results of multiple semaphores.
 typedef enum iree_hal_wait_mode_e {
   // Waits for all semaphores to reach or exceed their specified values.
diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_device.c b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
index c891d18..a78dc6f 100644
--- a/runtime/src/iree/hal/drivers/cuda/cuda_device.c
+++ b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
@@ -527,10 +527,10 @@
     iree_hal_command_buffer_t** out_command_buffer) {
   iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
   return iree_hal_cuda_stream_command_buffer_create(
-      base_device, device->cuda_symbols, device->nccl_symbols,
-      device->tracing_context, mode, command_categories, binding_capacity,
-      device->dispatch_cu_stream, &device->block_pool, device->host_allocator,
-      out_command_buffer);
+      iree_hal_device_allocator(base_device), device->cuda_symbols,
+      device->nccl_symbols, device->tracing_context, mode, command_categories,
+      binding_capacity, device->dispatch_cu_stream, &device->block_pool,
+      device->host_allocator, out_command_buffer);
 }
 
 static iree_status_t iree_hal_cuda_device_create_command_buffer(
@@ -547,22 +547,22 @@
       // command buffers.
       if (binding_capacity > 0) {
         return iree_hal_deferred_command_buffer_create(
-            base_device, mode, command_categories, binding_capacity,
-            &device->block_pool, iree_hal_device_host_allocator(base_device),
-            out_command_buffer);
+            iree_hal_device_allocator(base_device), mode, command_categories,
+            binding_capacity, &device->block_pool,
+            iree_hal_device_host_allocator(base_device), out_command_buffer);
       } else {
         return iree_hal_cuda_graph_command_buffer_create(
-            base_device, device->cuda_symbols, device->tracing_context,
-            device->cu_context, mode, command_categories, queue_affinity,
-            binding_capacity, &device->block_pool, device->host_allocator,
-            out_command_buffer);
+            iree_hal_device_allocator(base_device), device->cuda_symbols,
+            device->tracing_context, device->cu_context, mode,
+            command_categories, queue_affinity, binding_capacity,
+            &device->block_pool, device->host_allocator, out_command_buffer);
       }
     }
     case IREE_HAL_CUDA_COMMAND_BUFFER_MODE_STREAM: {
       return iree_hal_deferred_command_buffer_create(
-          base_device, mode, command_categories, binding_capacity,
-          &device->block_pool, iree_hal_device_host_allocator(base_device),
-          out_command_buffer);
+          iree_hal_device_allocator(base_device), mode, command_categories,
+          binding_capacity, &device->block_pool,
+          iree_hal_device_host_allocator(base_device), out_command_buffer);
     }
     default: {
       return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
diff --git a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
index de2de99..3350c5c 100644
--- a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
@@ -155,7 +155,7 @@
 #endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
 
 iree_status_t iree_hal_cuda_graph_command_buffer_create(
-    iree_hal_device_t* device,
+    iree_hal_allocator_t* device_allocator,
     const iree_hal_cuda_dynamic_symbols_t* cuda_symbols,
     iree_hal_cuda_tracing_context_t* tracing_context, CUcontext context,
     iree_hal_command_buffer_mode_t mode,
@@ -185,8 +185,8 @@
                             (void**)&command_buffer));
 
   iree_hal_command_buffer_initialize(
-      device, mode, command_categories, queue_affinity, binding_capacity,
-      (uint8_t*)command_buffer + sizeof(*command_buffer),
+      device_allocator, mode, command_categories, queue_affinity,
+      binding_capacity, (uint8_t*)command_buffer + sizeof(*command_buffer),
       &iree_hal_cuda_graph_command_buffer_vtable, &command_buffer->base);
   command_buffer->host_allocator = host_allocator;
   command_buffer->symbols = cuda_symbols;
diff --git a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.h b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.h
index d310fd1..e6e3c85 100644
--- a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.h
+++ b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.h
@@ -25,7 +25,7 @@
 // input data until reset. It must remain live for the lifetime of the command
 // buffers that use it.
 iree_status_t iree_hal_cuda_graph_command_buffer_create(
-    iree_hal_device_t* device,
+    iree_hal_allocator_t* device_allocator,
     const iree_hal_cuda_dynamic_symbols_t* cuda_symbols,
     iree_hal_cuda_tracing_context_t* tracing_context, CUcontext context,
     iree_hal_command_buffer_mode_t mode,
diff --git a/runtime/src/iree/hal/drivers/cuda/pending_queue_actions.c b/runtime/src/iree/hal/drivers/cuda/pending_queue_actions.c
index 33d2d02..bc11619 100644
--- a/runtime/src/iree/hal/drivers/cuda/pending_queue_actions.c
+++ b/runtime/src/iree/hal/drivers/cuda/pending_queue_actions.c
@@ -580,6 +580,8 @@
     status = iree_hal_resource_set_insert(
         action->resource_set, command_buffer_count, command_buffers);
   }
+  // TODO(indirect-cmd): clone binding table contents and add to resource set.
+
   if (iree_status_is_ok(status)) {
     // Retain the owning queue to make sure the action outlives it.
     iree_hal_resource_retain(actions);
diff --git a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c
index 6533cc3..fcc13ef 100644
--- a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c
@@ -58,7 +58,7 @@
 }
 
 iree_status_t iree_hal_cuda_stream_command_buffer_create(
-    iree_hal_device_t* device,
+    iree_hal_allocator_t* device_allocator,
     const iree_hal_cuda_dynamic_symbols_t* cuda_symbols,
     const iree_hal_cuda_nccl_dynamic_symbols_t* nccl_symbols,
     iree_hal_cuda_tracing_context_t* tracing_context,
@@ -67,7 +67,7 @@
     iree_host_size_t binding_capacity, CUstream stream,
     iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
     iree_hal_command_buffer_t** out_command_buffer) {
-  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(device_allocator);
   IREE_ASSERT_ARGUMENT(cuda_symbols);
   IREE_ASSERT_ARGUMENT(nccl_symbols);
   IREE_ASSERT_ARGUMENT(out_command_buffer);
@@ -91,7 +91,7 @@
                             (void**)&command_buffer));
 
   iree_hal_command_buffer_initialize(
-      device, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY,
+      device_allocator, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY,
       binding_capacity, (uint8_t*)command_buffer + sizeof(*command_buffer),
       &iree_hal_cuda_stream_command_buffer_vtable, &command_buffer->base);
   command_buffer->host_allocator = host_allocator;
diff --git a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.h b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.h
index 6f7383b..5ab4a11 100644
--- a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.h
+++ b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.h
@@ -30,7 +30,7 @@
 // retained by the source deferred command buffer and as such the |block_pool|
 // and can be NULL to avoid a double copy.
 iree_status_t iree_hal_cuda_stream_command_buffer_create(
-    iree_hal_device_t* device,
+    iree_hal_allocator_t* device_allocator,
     const iree_hal_cuda_dynamic_symbols_t* cuda_symbols,
     const iree_hal_cuda_nccl_dynamic_symbols_t* nccl_symbols,
     iree_hal_cuda_tracing_context_t* tracing_context,
diff --git a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c
index 7ba006c..bae70fb 100644
--- a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c
@@ -156,7 +156,7 @@
 #endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
 
 iree_status_t iree_hal_hip_graph_command_buffer_create(
-    iree_hal_device_t* device,
+    iree_hal_allocator_t* device_allocator,
     const iree_hal_hip_dynamic_symbols_t* hip_symbols,
     iree_hal_hip_tracing_context_t* tracing_context, hipCtx_t context,
     iree_hal_command_buffer_mode_t mode,
@@ -164,7 +164,7 @@
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
     iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
     iree_hal_command_buffer_t** out_command_buffer) {
-  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(device_allocator);
   IREE_ASSERT_ARGUMENT(hip_symbols);
   IREE_ASSERT_ARGUMENT(block_pool);
   IREE_ASSERT_ARGUMENT(out_command_buffer);
@@ -188,8 +188,8 @@
                             (void**)&command_buffer));
 
   iree_hal_command_buffer_initialize(
-      device, mode, command_categories, queue_affinity, binding_capacity,
-      (uint8_t*)command_buffer + sizeof(*command_buffer),
+      device_allocator, mode, command_categories, queue_affinity,
+      binding_capacity, (uint8_t*)command_buffer + sizeof(*command_buffer),
       &iree_hal_hip_graph_command_buffer_vtable, &command_buffer->base);
   command_buffer->host_allocator = host_allocator;
   command_buffer->symbols = hip_symbols;
diff --git a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.h b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.h
index d5393bc..7235d38 100644
--- a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.h
+++ b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.h
@@ -28,7 +28,7 @@
 // NOTE: the |block_pool| must remain live for the lifetime of the command
 // buffers that use it.
 iree_status_t iree_hal_hip_graph_command_buffer_create(
-    iree_hal_device_t* device,
+    iree_hal_allocator_t* device_allocator,
     const iree_hal_hip_dynamic_symbols_t* hip_symbols,
     iree_hal_hip_tracing_context_t* tracing_context, hipCtx_t context,
     iree_hal_command_buffer_mode_t mode,
diff --git a/runtime/src/iree/hal/drivers/hip/hip_device.c b/runtime/src/iree/hal/drivers/hip/hip_device.c
index dfd1b20..6a10fcb 100644
--- a/runtime/src/iree/hal/drivers/hip/hip_device.c
+++ b/runtime/src/iree/hal/drivers/hip/hip_device.c
@@ -522,10 +522,10 @@
     iree_hal_command_buffer_t** out_command_buffer) {
   iree_hal_hip_device_t* device = iree_hal_hip_device_cast(base_device);
   return iree_hal_hip_stream_command_buffer_create(
-      base_device, device->hip_symbols, device->nccl_symbols,
-      device->tracing_context, mode, command_categories, binding_capacity,
-      device->hip_dispatch_stream, &device->block_pool, device->host_allocator,
-      out_command_buffer);
+      iree_hal_device_allocator(base_device), device->hip_symbols,
+      device->nccl_symbols, device->tracing_context, mode, command_categories,
+      binding_capacity, device->hip_dispatch_stream, &device->block_pool,
+      device->host_allocator, out_command_buffer);
 }
 
 static iree_status_t iree_hal_hip_device_create_command_buffer(
@@ -542,23 +542,23 @@
     // need to be persisted. This lets us lower the execution delay as we can
     // directly route commands to a HIP stream and let it eagerly flush.
     return iree_hal_hip_stream_command_buffer_create(
-        base_device, device->hip_symbols, device->nccl_symbols,
-        device->tracing_context, mode, command_categories, binding_capacity,
-        device->hip_dispatch_stream, &device->block_pool,
+        iree_hal_device_allocator(base_device), device->hip_symbols,
+        device->nccl_symbols, device->tracing_context, mode, command_categories,
+        binding_capacity, device->hip_dispatch_stream, &device->block_pool,
         device->host_allocator, out_command_buffer);
   }
   switch (device->params.command_buffer_mode) {
     case IREE_HAL_HIP_COMMAND_BUFFER_MODE_GRAPH:
       return iree_hal_hip_graph_command_buffer_create(
-          base_device, device->hip_symbols, device->tracing_context,
-          device->hip_context, mode, command_categories, queue_affinity,
-          binding_capacity, &device->block_pool, device->host_allocator,
-          out_command_buffer);
+          iree_hal_device_allocator(base_device), device->hip_symbols,
+          device->tracing_context, device->hip_context, mode,
+          command_categories, queue_affinity, binding_capacity,
+          &device->block_pool, device->host_allocator, out_command_buffer);
     case IREE_HAL_HIP_COMMAND_BUFFER_MODE_STREAM:
       return iree_hal_deferred_command_buffer_create(
-          base_device, mode, command_categories, binding_capacity,
-          &device->block_pool, iree_hal_device_host_allocator(base_device),
-          out_command_buffer);
+          iree_hal_device_allocator(base_device), mode, command_categories,
+          binding_capacity, &device->block_pool,
+          iree_hal_device_host_allocator(base_device), out_command_buffer);
     default:
       return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                               "invalid command buffer mode");
diff --git a/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c b/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c
index c225c17..b91b6b4 100644
--- a/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c
+++ b/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c
@@ -581,6 +581,8 @@
     status = iree_hal_resource_set_insert(
         action->resource_set, command_buffer_count, command_buffers);
   }
+  // TODO(indirect-cmd): clone binding table contents and add to resource set.
+
   if (iree_status_is_ok(status)) {
     // Retain the owning queue to make sure the action outlives it.
     iree_hal_resource_retain(actions);
diff --git a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c
index cedca50..ede6d8c 100644
--- a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c
@@ -58,7 +58,7 @@
 }
 
 iree_status_t iree_hal_hip_stream_command_buffer_create(
-    iree_hal_device_t* device,
+    iree_hal_allocator_t* device_allocator,
     const iree_hal_hip_dynamic_symbols_t* hip_symbols,
     const iree_hal_hip_nccl_dynamic_symbols_t* nccl_symbols,
     iree_hal_hip_tracing_context_t* tracing_context,
@@ -67,7 +67,7 @@
     iree_host_size_t binding_capacity, hipStream_t stream,
     iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
     iree_hal_command_buffer_t** out_command_buffer) {
-  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(device_allocator);
   IREE_ASSERT_ARGUMENT(hip_symbols);
   IREE_ASSERT_ARGUMENT(nccl_symbols);
   IREE_ASSERT_ARGUMENT(out_command_buffer);
@@ -91,7 +91,7 @@
                             (void**)&command_buffer));
 
   iree_hal_command_buffer_initialize(
-      device, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY,
+      device_allocator, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY,
       binding_capacity, (uint8_t*)command_buffer + sizeof(*command_buffer),
       &iree_hal_hip_stream_command_buffer_vtable, &command_buffer->base);
   command_buffer->host_allocator = host_allocator;
diff --git a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.h b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.h
index 3b79ddd..e6a1dfc 100644
--- a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.h
+++ b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.h
@@ -30,7 +30,7 @@
 // retained by the source deferred command buffer and as such the |block_pool|
 // and can be NULL to avoid a double copy.
 iree_status_t iree_hal_hip_stream_command_buffer_create(
-    iree_hal_device_t* device,
+    iree_hal_allocator_t* device_allocator,
     const iree_hal_hip_dynamic_symbols_t* hip_symbols,
     const iree_hal_hip_nccl_dynamic_symbols_t* nccl_symbols,
     iree_hal_hip_tracing_context_t* tracing_context,
diff --git a/runtime/src/iree/hal/drivers/local_sync/sync_device.c b/runtime/src/iree/hal/drivers/local_sync/sync_device.c
index 7e6323a..e3a384c 100644
--- a/runtime/src/iree/hal/drivers/local_sync/sync_device.c
+++ b/runtime/src/iree/hal/drivers/local_sync/sync_device.c
@@ -235,13 +235,15 @@
   if (iree_all_bits_set(mode,
                         IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION)) {
     return iree_hal_inline_command_buffer_create(
-        base_device, mode, command_categories, queue_affinity, binding_capacity,
+        iree_hal_device_allocator(base_device), mode, command_categories,
+        queue_affinity, binding_capacity,
         iree_hal_device_host_allocator(base_device), out_command_buffer);
   } else {
     iree_hal_sync_device_t* device = iree_hal_sync_device_cast(base_device);
     return iree_hal_deferred_command_buffer_create(
-        base_device, mode, command_categories, binding_capacity,
-        &device->large_block_pool, device->host_allocator, out_command_buffer);
+        iree_hal_device_allocator(base_device), mode, command_categories,
+        binding_capacity, &device->large_block_pool, device->host_allocator,
+        out_command_buffer);
   }
 }
 
@@ -415,7 +417,7 @@
       // binding tables and can be validated entirely while recording.
       iree_hal_command_buffer_t* inline_command_buffer = NULL;
       IREE_RETURN_IF_ERROR(iree_hal_inline_command_buffer_initialize(
-          (iree_hal_device_t*)device,
+          device->device_allocator,
           iree_hal_command_buffer_mode(command_buffer) |
               IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION |
               IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED,
diff --git a/runtime/src/iree/hal/drivers/local_task/BUILD.bazel b/runtime/src/iree/hal/drivers/local_task/BUILD.bazel
index 7586334..b6231bf 100644
--- a/runtime/src/iree/hal/drivers/local_task/BUILD.bazel
+++ b/runtime/src/iree/hal/drivers/local_task/BUILD.bazel
@@ -47,6 +47,7 @@
         "//runtime/src/iree/hal/local",
         "//runtime/src/iree/hal/local:executable_environment",
         "//runtime/src/iree/hal/local:executable_library",
+        "//runtime/src/iree/hal/utils:deferred_command_buffer",
         "//runtime/src/iree/hal/utils:file_transfer",
         "//runtime/src/iree/hal/utils:memory_file",
         "//runtime/src/iree/hal/utils:resource_set",
diff --git a/runtime/src/iree/hal/drivers/local_task/CMakeLists.txt b/runtime/src/iree/hal/drivers/local_task/CMakeLists.txt
index 4d2b0ba..ba8f2a9 100644
--- a/runtime/src/iree/hal/drivers/local_task/CMakeLists.txt
+++ b/runtime/src/iree/hal/drivers/local_task/CMakeLists.txt
@@ -41,6 +41,7 @@
     iree::hal::local
     iree::hal::local::executable_environment
     iree::hal::local::executable_library
+    iree::hal::utils::deferred_command_buffer
     iree::hal::utils::file_transfer
     iree::hal::utils::memory_file
     iree::hal::utils::resource_set
diff --git a/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c b/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c
index b513c54..95c503e 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c
@@ -106,7 +106,7 @@
 }
 
 iree_status_t iree_hal_task_command_buffer_create(
-    iree_hal_device_t* device, iree_task_scope_t* scope,
+    iree_hal_allocator_t* device_allocator, iree_task_scope_t* scope,
     iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
@@ -144,8 +144,8 @@
       (void**)&command_buffer);
   if (iree_status_is_ok(status)) {
     iree_hal_command_buffer_initialize(
-        device, mode, command_categories, queue_affinity, binding_capacity,
-        (uint8_t*)command_buffer + sizeof(*command_buffer),
+        device_allocator, mode, command_categories, queue_affinity,
+        binding_capacity, (uint8_t*)command_buffer + sizeof(*command_buffer),
         &iree_hal_task_command_buffer_vtable, &command_buffer->base);
     command_buffer->host_allocator = host_allocator;
     command_buffer->scope = scope;
diff --git a/runtime/src/iree/hal/drivers/local_task/task_command_buffer.h b/runtime/src/iree/hal/drivers/local_task/task_command_buffer.h
index ee7e542..59fe300 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_command_buffer.h
+++ b/runtime/src/iree/hal/drivers/local_task/task_command_buffer.h
@@ -19,7 +19,7 @@
 #endif  // __cplusplus
 
 iree_status_t iree_hal_task_command_buffer_create(
-    iree_hal_device_t* device, iree_task_scope_t* scope,
+    iree_hal_allocator_t* device_allocator, iree_task_scope_t* scope,
     iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
diff --git a/runtime/src/iree/hal/drivers/local_task/task_device.c b/runtime/src/iree/hal/drivers/local_task/task_device.c
index d84f7f5..88f35f6 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_device.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_device.c
@@ -19,6 +19,7 @@
 #include "iree/hal/local/executable_environment.h"
 #include "iree/hal/local/local_executable_cache.h"
 #include "iree/hal/local/local_pipeline_layout.h"
+#include "iree/hal/utils/deferred_command_buffer.h"
 #include "iree/hal/utils/file_transfer.h"
 #include "iree/hal/utils/memory_file.h"
 
@@ -132,9 +133,12 @@
     device->queue_count = queue_count;
     for (iree_host_size_t i = 0; i < device->queue_count; ++i) {
       // TODO(benvanik): add a number to each queue ID.
+      iree_hal_queue_affinity_t queue_affinity = 1ull << i;
       iree_hal_task_queue_initialize(
-          device->identifier, params->queue_scope_flags, queue_executors[i],
-          &device->small_block_pool, &device->queues[i]);
+          device->identifier, queue_affinity, params->queue_scope_flags,
+          queue_executors[i], &device->small_block_pool,
+          &device->large_block_pool, device->device_allocator,
+          &device->queues[i]);
     }
   }
 
@@ -292,12 +296,25 @@
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
     iree_hal_command_buffer_t** out_command_buffer) {
   iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
-  iree_host_size_t queue_index = iree_hal_task_device_select_queue(
-      device, command_categories, queue_affinity);
-  return iree_hal_task_command_buffer_create(
-      base_device, &device->queues[queue_index].scope, mode, command_categories,
-      queue_affinity, binding_capacity, &device->large_block_pool,
-      device->host_allocator, out_command_buffer);
+  if (binding_capacity > 0) {
+    // TODO(indirect-cmd): natively support reusable task command buffers. For
+    // now we emulate by recording into a deferred command buffer and
+    // recording/issuing at submission time. The task system needs some
+    // reworking to support being able to resubmit task graphs as today it is
+    // destructive.
+    return iree_hal_deferred_command_buffer_create(
+        iree_hal_device_allocator(base_device), mode, command_categories,
+        binding_capacity, &device->large_block_pool, device->host_allocator,
+        out_command_buffer);
+  } else {
+    iree_host_size_t queue_index = iree_hal_task_device_select_queue(
+        device, command_categories, queue_affinity);
+    return iree_hal_task_command_buffer_create(
+        iree_hal_device_allocator(base_device),
+        &device->queues[queue_index].scope, mode, command_categories,
+        queue_affinity, binding_capacity, &device->large_block_pool,
+        device->host_allocator, out_command_buffer);
+  }
 }
 
 static iree_status_t iree_hal_task_device_create_descriptor_set_layout(
@@ -471,11 +488,12 @@
                                               wait_semaphore_list,
                                               signal_semaphore_list);
   }
-  iree_hal_submission_batch_t batch = {
+  iree_hal_task_submission_batch_t batch = {
       .wait_semaphores = wait_semaphore_list,
       .signal_semaphores = signal_semaphore_list,
       .command_buffer_count = command_buffer_count,
       .command_buffers = command_buffers,
+      .binding_tables = binding_tables,
   };
   return iree_hal_task_queue_submit_commands(&device->queues[queue_index], 1,
                                              &batch);
diff --git a/runtime/src/iree/hal/drivers/local_task/task_queue.c b/runtime/src/iree/hal/drivers/local_task/task_queue.c
index 985ad2e..ebf093d 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_queue.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_queue.c
@@ -11,6 +11,8 @@
 
 #include "iree/hal/drivers/local_task/task_command_buffer.h"
 #include "iree/hal/drivers/local_task/task_semaphore.h"
+#include "iree/hal/utils/deferred_command_buffer.h"
+#include "iree/hal/utils/resource_set.h"
 #include "iree/task/submission.h"
 
 // Each submission is turned into a DAG for execution:
@@ -186,11 +188,64 @@
   // if we are the last issue pending.
   iree_hal_task_queue_t* queue;
 
-  // Command buffers to be issued in the order the appeared in the submission.
+  // A resource set containing all binding table buffers.
+  iree_hal_resource_set_t* resource_set;
+
+  // Command buffers to be issued in the order they appeared in the submission.
   iree_host_size_t command_buffer_count;
-  iree_hal_command_buffer_t* command_buffers[];
+  iree_hal_command_buffer_t** command_buffers;
+  iree_hal_buffer_binding_table_t* binding_tables;
 } iree_hal_task_queue_issue_cmd_t;
 
+static iree_status_t iree_hal_task_queue_issue_cmd_deferred(
+    iree_hal_task_queue_issue_cmd_t* cmd,
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_buffer_binding_table_t binding_table,
+    iree_task_submission_t* pending_submission) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Create a transient command buffer that we'll apply the deferred commands
+  // into. It will live beyond this function as we'll issue the commands but
+  // they may not run immediately.
+  iree_hal_command_buffer_t* task_command_buffer = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_task_command_buffer_create(
+              cmd->queue->device_allocator, &cmd->queue->scope,
+              iree_hal_command_buffer_mode(command_buffer),
+              iree_hal_command_buffer_allowed_categories(command_buffer),
+              cmd->queue->affinity, /*binding_capacity=*/0,
+              cmd->queue->large_block_pool,
+              iree_hal_allocator_host_allocator(cmd->queue->device_allocator),
+              &task_command_buffer));
+
+  // Keep the command buffer live until the queue operation completes.
+  iree_status_t status =
+      iree_hal_resource_set_insert(cmd->resource_set, 1, &task_command_buffer);
+  if (!iree_status_is_ok(status)) {
+    iree_hal_command_buffer_release(task_command_buffer);
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+
+  // Replay the commands from the deferred command buffer into the new task one.
+  // This creates the task graph and captures the binding references but does
+  // not yet issue the commands.
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_deferred_command_buffer_apply(
+              command_buffer, task_command_buffer, binding_table));
+
+  // Issue the task command buffer as if it had been recorded directly to begin
+  // with.
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_task_command_buffer_issue(task_command_buffer,
+                                             &cmd->queue->state,
+                                             cmd->task.header.completion_task,
+                                             cmd->arena, pending_submission));
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
 // Issues a set of command buffers without waiting for them to complete.
 static iree_status_t iree_hal_task_queue_issue_cmd(
     void* user_context, iree_task_t* task,
@@ -203,10 +258,23 @@
   // NOTE: it's ok for there to be no command buffers - in that case the
   // submission was purely for synchronization.
   for (iree_host_size_t i = 0; i < cmd->command_buffer_count; ++i) {
-    if (iree_hal_task_command_buffer_isa(cmd->command_buffers[i])) {
-      status = iree_hal_task_command_buffer_issue(
-          cmd->command_buffers[i], &cmd->queue->state,
-          cmd->task.header.completion_task, cmd->arena, pending_submission);
+    iree_hal_command_buffer_t* command_buffer = cmd->command_buffers[i];
+    if (iree_hal_task_command_buffer_isa(command_buffer)) {
+      if (cmd->binding_tables && cmd->binding_tables[i].count > 0) {
+        status = iree_make_status(
+            IREE_STATUS_UNIMPLEMENTED,
+            "task command buffers do not support binding tables yet");
+      } else {
+        status = iree_hal_task_command_buffer_issue(
+            command_buffer, &cmd->queue->state,
+            cmd->task.header.completion_task, cmd->arena, pending_submission);
+      }
+    } else if (iree_hal_deferred_command_buffer_isa(command_buffer)) {
+      iree_hal_buffer_binding_table_t binding_table =
+          cmd->binding_tables ? cmd->binding_tables[i]
+                              : iree_hal_buffer_binding_table_empty();
+      status = iree_hal_task_queue_issue_cmd_deferred(
+          cmd, command_buffer, binding_table, pending_submission);
     } else {
       status = iree_make_status(
           IREE_STATUS_UNIMPLEMENTED,
@@ -219,30 +287,112 @@
   return status;
 }
 
+// Cleanup for iree_hal_task_queue_issue_cmd_t that releases the retained
+// resources.
+static void iree_hal_task_queue_issue_cmd_cleanup(
+    iree_task_t* task, iree_status_code_t status_code) {
+  iree_hal_task_queue_issue_cmd_t* cmd = (iree_hal_task_queue_issue_cmd_t*)task;
+  if (cmd->resource_set) {
+    IREE_TRACE_ZONE_BEGIN(z0);
+    iree_hal_resource_set_free(cmd->resource_set);
+    IREE_TRACE_ZONE_END(z0);
+  }
+}
+
 // Allocates and initializes a iree_hal_task_queue_issue_cmd_t task.
 static iree_status_t iree_hal_task_queue_issue_cmd_allocate(
     void* user_data, iree_task_scope_t* scope, iree_hal_task_queue_t* queue,
     iree_host_size_t resource_count, iree_hal_resource_t* const* resources,
     iree_task_t* retire_task, iree_arena_allocator_t* arena,
     iree_task_t** out_issue_task) {
+  iree_hal_task_submission_batch_t* batch =
+      (iree_hal_task_submission_batch_t*)user_data;
+
   iree_hal_task_queue_issue_cmd_t* cmd = NULL;
-  iree_host_size_t total_cmd_size =
-      sizeof(*cmd) + resource_count * sizeof(*cmd->command_buffers);
+  iree_host_size_t command_buffers_size =
+      batch->command_buffer_count * sizeof(*cmd->command_buffers);
+  iree_host_size_t binding_tables_size = 0;
+  iree_host_size_t binding_table_elements_size = 0;
+  if (batch->binding_tables) {
+    binding_tables_size =
+        batch->command_buffer_count * sizeof(*cmd->binding_tables);
+    for (iree_host_size_t i = 0; i < batch->command_buffer_count; ++i) {
+      binding_table_elements_size += batch->binding_tables[i].count *
+                                     sizeof(*batch->binding_tables[i].bindings);
+    }
+  }
+  iree_host_size_t total_cmd_size = sizeof(*cmd) + command_buffers_size +
+                                    binding_tables_size +
+                                    binding_table_elements_size;
   IREE_RETURN_IF_ERROR(
       iree_arena_allocate(arena, total_cmd_size, (void**)&cmd));
   iree_task_call_initialize(
       scope, iree_task_make_call_closure(iree_hal_task_queue_issue_cmd, 0),
       &cmd->task);
+  iree_task_set_cleanup_fn(&cmd->task.header,
+                           iree_hal_task_queue_issue_cmd_cleanup);
   iree_task_set_completion_task(&cmd->task.header, retire_task);
   cmd->arena = arena;
   cmd->queue = queue;
 
-  cmd->command_buffer_count = resource_count;
-  memcpy(cmd->command_buffers, resources,
-         resource_count * sizeof(cmd->command_buffers[0]));
+  cmd->command_buffer_count = batch->command_buffer_count;
+  cmd->command_buffers =
+      (iree_hal_command_buffer_t**)((uint8_t*)cmd + sizeof(*cmd));
+  bool has_any_deferred = false;
+  for (iree_host_size_t i = 0; i < batch->command_buffer_count; ++i) {
+    iree_hal_command_buffer_t* command_buffer = batch->command_buffers[i];
+    cmd->command_buffers[i] = batch->command_buffers[i];
+    has_any_deferred |= iree_hal_deferred_command_buffer_isa(command_buffer);
+  }
 
-  *out_issue_task = &cmd->task.header;
-  return iree_ok_status();
+  // Only create a resource set if we know we need it.
+  // NOTE: if this fails we'll unwind and release the cmd arena in the caller.
+  if (has_any_deferred || binding_table_elements_size > 0) {
+    IREE_RETURN_IF_ERROR(
+        iree_hal_resource_set_allocate(arena->block_pool, &cmd->resource_set));
+  } else {
+    cmd->resource_set = NULL;
+  }
+
+  // Binding tables are optional and we only need this extra work if there were
+  // any non-empty binding tables provided during submission.
+  iree_status_t status = iree_ok_status();
+  if (binding_table_elements_size > 0) {
+    // Copy over binding tables and all of their contents.
+    cmd->binding_tables =
+        (iree_hal_buffer_binding_table_t*)((uint8_t*)cmd->command_buffers +
+                                           command_buffers_size);
+    iree_hal_buffer_binding_t* binding_element_ptr =
+        (iree_hal_buffer_binding_t*)((uint8_t*)cmd->binding_tables + binding_tables_size);
+    for (iree_host_size_t i = 0; i < batch->command_buffer_count; ++i) {
+      iree_host_size_t element_count = batch->binding_tables[i].count;
+      cmd->binding_tables[i].count = element_count;
+      cmd->binding_tables[i].bindings = binding_element_ptr;
+      memcpy((void*)cmd->binding_tables[i].bindings,
+             batch->binding_tables[i].bindings,
+             element_count * sizeof(*binding_element_ptr));
+      binding_element_ptr += element_count;
+
+      // Bulk insert all bindings into the resource set. This will keep the
+      // referenced buffers live until the issue has completed. Note that if we
+      // fail here we need to clean up the resource set below before returning.
+      status = iree_hal_resource_set_insert_strided(
+          cmd->resource_set, element_count, cmd->binding_tables[i].bindings,
+          offsetof(iree_hal_buffer_binding_t, buffer),
+          sizeof(iree_hal_buffer_binding_t));
+      if (!iree_status_is_ok(status)) break;
+    }
+  } else {
+    cmd->binding_tables = NULL;
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_issue_task = &cmd->task.header;
+  } else {
+    iree_hal_resource_set_free(cmd->resource_set);
+    cmd->resource_set = NULL;
+  }
+  return status;
 }
 
 //===----------------------------------------------------------------------===//
@@ -401,18 +551,25 @@
 //===----------------------------------------------------------------------===//
 
 void iree_hal_task_queue_initialize(iree_string_view_t identifier,
+                                    iree_hal_queue_affinity_t affinity,
                                     iree_task_scope_flags_t scope_flags,
                                     iree_task_executor_t* executor,
-                                    iree_arena_block_pool_t* block_pool,
+                                    iree_arena_block_pool_t* small_block_pool,
+                                    iree_arena_block_pool_t* large_block_pool,
+                                    iree_hal_allocator_t* device_allocator,
                                     iree_hal_task_queue_t* out_queue) {
   IREE_TRACE_ZONE_BEGIN(z0);
   IREE_TRACE_ZONE_APPEND_TEXT(z0, identifier.data, identifier.size);
 
   memset(out_queue, 0, sizeof(*out_queue));
 
+  out_queue->affinity = affinity;
   out_queue->executor = executor;
   iree_task_executor_retain(out_queue->executor);
-  out_queue->block_pool = block_pool;
+  out_queue->small_block_pool = small_block_pool;
+  out_queue->large_block_pool = large_block_pool;
+  out_queue->device_allocator = device_allocator;
+  iree_hal_allocator_retain(out_queue->device_allocator);
 
   iree_task_scope_initialize(identifier, scope_flags, &out_queue->scope);
 
@@ -429,6 +586,7 @@
 
   iree_hal_task_queue_state_deinitialize(&queue->state);
   iree_task_scope_deinitialize(&queue->scope);
+  iree_hal_allocator_release(queue->device_allocator);
   iree_task_executor_release(queue->executor);
 
   IREE_TRACE_ZONE_END(z0);
@@ -456,7 +614,7 @@
   iree_hal_task_queue_retire_cmd_t* retire_cmd = NULL;
   IREE_RETURN_IF_ERROR(iree_hal_task_queue_retire_cmd_allocate(
       &queue->scope, resource_count, resources, &signal_semaphores,
-      queue->block_pool, &retire_cmd));
+      queue->small_block_pool, &retire_cmd));
 
   // NOTE: if we fail from here on we must drop the retire_cmd arena.
   iree_status_t status = iree_ok_status();
@@ -527,12 +685,12 @@
 
 static iree_status_t iree_hal_task_queue_submit_batches(
     iree_hal_task_queue_t* queue, iree_host_size_t batch_count,
-    const iree_hal_submission_batch_t* batches) {
+    const iree_hal_task_submission_batch_t* batches) {
   // For now we process each batch independently. To elide additional semaphore
   // work and prevent unneeded coordinator scheduling logic we could instead
   // build the whole DAG prior to submitting.
   for (iree_host_size_t i = 0; i < batch_count; ++i) {
-    const iree_hal_submission_batch_t* batch = &batches[i];
+    const iree_hal_task_submission_batch_t* batch = &batches[i];
     IREE_RETURN_IF_ERROR(iree_hal_task_queue_submit(
         queue, batch->wait_semaphores, batch->signal_semaphores,
         batch->command_buffer_count,
@@ -544,7 +702,7 @@
 
 iree_status_t iree_hal_task_queue_submit_commands(
     iree_hal_task_queue_t* queue, iree_host_size_t batch_count,
-    const iree_hal_submission_batch_t* batches) {
+    const iree_hal_task_submission_batch_t* batches) {
   IREE_TRACE_ZONE_BEGIN(z0);
   iree_status_t status =
       iree_hal_task_queue_submit_batches(queue, batch_count, batches);
diff --git a/runtime/src/iree/hal/drivers/local_task/task_queue.h b/runtime/src/iree/hal/drivers/local_task/task_queue.h
index 74cabd8..0d667ae 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_queue.h
+++ b/runtime/src/iree/hal/drivers/local_task/task_queue.h
@@ -22,17 +22,47 @@
 extern "C" {
 #endif  // __cplusplus
 
+// A single batch of command buffers submitted to a device queue.
+// All of the wait semaphores must reach or exceed the given payload values
+// prior to the batch beginning execution. Each command buffer begins execution
+// in the order it is present in the list, though note that the command buffers
+// execute concurrently and require internal synchronization via events if there
+// are any dependencies between them. Only after all command buffers have
+// completed will the signal semaphores be updated to the provided payload
+// values.
+typedef struct iree_hal_task_submission_batch_t {
+  // Semaphores to wait on prior to executing any command buffer.
+  iree_hal_semaphore_list_t wait_semaphores;
+
+  // Command buffers to execute, in order, and optional binding tables 1:1.
+  iree_host_size_t command_buffer_count;
+  iree_hal_command_buffer_t* const* command_buffers;
+  iree_hal_buffer_binding_table_t const* binding_tables;
+
+  // Semaphores to signal once all command buffers have completed execution.
+  iree_hal_semaphore_list_t signal_semaphores;
+} iree_hal_task_submission_batch_t;
+
 typedef struct iree_hal_task_queue_t {
+  // Affinity mask this queue processes.
+  iree_hal_queue_affinity_t affinity;
+
   // Shared executor that the queue submits tasks to.
   iree_task_executor_t* executor;
 
   // Shared block pool for allocating submission transients (tasks/events/etc).
-  iree_arena_block_pool_t* block_pool;
+  iree_arena_block_pool_t* small_block_pool;
+  // Shared block pool for large allocations (command buffers/etc).
+  iree_arena_block_pool_t* large_block_pool;
+
+  // Device allocator used for transient allocations/tracking.
+  iree_hal_allocator_t* device_allocator;
 
   // Scope used for all tasks in the queue.
   // This allows for easy waits on all outstanding queue tasks as well as
   // differentiation of tasks within the executor.
   iree_task_scope_t scope;
+
   // State tracking used during command buffer issue.
   // The intra-queue synchronization (barriers/events) carries across command
   // buffers and this is used to rendezvous the tasks in each set.
@@ -40,9 +70,12 @@
 } iree_hal_task_queue_t;
 
 void iree_hal_task_queue_initialize(iree_string_view_t identifier,
+                                    iree_hal_queue_affinity_t affinity,
                                     iree_task_scope_flags_t scope_flags,
                                     iree_task_executor_t* executor,
-                                    iree_arena_block_pool_t* block_pool,
+                                    iree_arena_block_pool_t* small_block_pool,
+                                    iree_arena_block_pool_t* large_block_pool,
+                                    iree_hal_allocator_t* device_allocator,
                                     iree_hal_task_queue_t* out_queue);
 
 void iree_hal_task_queue_deinitialize(iree_hal_task_queue_t* queue);
@@ -55,7 +88,7 @@
 
 iree_status_t iree_hal_task_queue_submit_commands(
     iree_hal_task_queue_t* queue, iree_host_size_t batch_count,
-    const iree_hal_submission_batch_t* batches);
+    const iree_hal_task_submission_batch_t* batches);
 
 iree_status_t iree_hal_task_queue_submit_callback(
     iree_hal_task_queue_t* queue, iree_hal_semaphore_list_t wait_semaphores,
diff --git a/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m b/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m
index a21eaff..50d01e7 100644
--- a/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m
+++ b/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m
@@ -359,8 +359,8 @@
                                                           mode, binding_capacity),
                             (void**)&command_buffer));
 
-  iree_hal_command_buffer_initialize(device, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY,
-                                     binding_capacity,
+  iree_hal_command_buffer_initialize(iree_hal_device_allocator(device), mode, command_categories,
+                                     IREE_HAL_QUEUE_AFFINITY_ANY, binding_capacity,
                                      (uint8_t*)command_buffer + sizeof(*command_buffer),
                                      &iree_hal_metal_command_buffer_vtable, &command_buffer->base);
   command_buffer->device = device;
diff --git a/runtime/src/iree/hal/drivers/vulkan/command_queue.h b/runtime/src/iree/hal/drivers/vulkan/command_queue.h
index 051f873..52fcfa3 100644
--- a/runtime/src/iree/hal/drivers/vulkan/command_queue.h
+++ b/runtime/src/iree/hal/drivers/vulkan/command_queue.h
@@ -20,6 +20,32 @@
 namespace hal {
 namespace vulkan {
 
+// A single batch of command buffers submitted to a device queue.
+// All of the wait semaphores must reach or exceed the given payload values
+// prior to the batch beginning execution. Each command buffer begins execution
+// in the order it is present in the list, though note that the command buffers
+// execute concurrently and require internal synchronization via events if there
+// are any dependencies between them. Only after all command buffers have
+// completed will the signal semaphores be updated to the provided payload
+// values.
+//
+// Matches Vulkan's VkSubmitInfo:
+// https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkSubmitInfo.html
+// Note that as the HAL only models timeline semaphores we take the payload
+// values directly in this struct; see:
+// https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkTimelineSemaphoreSubmitInfo.html
+typedef struct iree_hal_vulkan_submission_batch_t {
+  // Semaphores to wait on prior to executing any command buffer.
+  iree_hal_semaphore_list_t wait_semaphores;
+
+  // Command buffers to execute, in order.
+  iree_host_size_t command_buffer_count;
+  iree_hal_command_buffer_t* const* command_buffers;
+
+  // Semaphores to signal once all command buffers have completed execution.
+  iree_hal_semaphore_list_t signal_semaphores;
+} iree_hal_vulkan_submission_batch_t;
+
 class CommandQueue {
  public:
   virtual ~CommandQueue() {
@@ -47,8 +73,9 @@
     return iree_all_bits_set(supported_categories_,
                              IREE_HAL_COMMAND_CATEGORY_DISPATCH);
   }
-  virtual iree_status_t Submit(iree_host_size_t batch_count,
-                               const iree_hal_submission_batch_t* batches) = 0;
+  virtual iree_status_t Submit(
+      iree_host_size_t batch_count,
+      const iree_hal_vulkan_submission_batch_t* batches) = 0;
 
   virtual iree_status_t WaitIdle(iree_timeout_t timeout) = 0;
 
diff --git a/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.cc b/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.cc
index d956347..b66be80 100644
--- a/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.cc
@@ -74,7 +74,7 @@
 }
 
 iree_status_t iree_hal_vulkan_direct_command_buffer_allocate(
-    iree_hal_device_t* device,
+    iree_hal_allocator_t* device_allocator,
     iree::hal::vulkan::VkDeviceHandle* logical_device,
     iree::hal::vulkan::VkCommandPoolHandle* command_pool,
     iree_hal_command_buffer_mode_t mode,
@@ -85,6 +85,7 @@
     iree::hal::vulkan::BuiltinExecutables* builtin_executables,
     iree_arena_block_pool_t* block_pool,
     iree_hal_command_buffer_t** out_command_buffer) {
+  IREE_ASSERT_ARGUMENT(device_allocator);
   IREE_ASSERT_ARGUMENT(logical_device);
   IREE_ASSERT_ARGUMENT(command_pool);
   IREE_ASSERT_ARGUMENT(descriptor_pool_cache);
@@ -119,8 +120,8 @@
       (void**)&command_buffer);
   if (iree_status_is_ok(status)) {
     iree_hal_command_buffer_initialize(
-        device, mode, command_categories, queue_affinity, binding_capacity,
-        (uint8_t*)command_buffer + sizeof(*command_buffer),
+        device_allocator, mode, command_categories, queue_affinity,
+        binding_capacity, (uint8_t*)command_buffer + sizeof(*command_buffer),
         &iree_hal_vulkan_direct_command_buffer_vtable, &command_buffer->base);
     command_buffer->logical_device = logical_device;
     command_buffer->tracing_context = tracing_context;
diff --git a/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.h b/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.h
index 38c59dc..ada33b7 100644
--- a/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.h
+++ b/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.h
@@ -25,7 +25,7 @@
 // NOTE: the |block_pool| must remain live for the lifetime of the command
 // buffers that use it.
 iree_status_t iree_hal_vulkan_direct_command_buffer_allocate(
-    iree_hal_device_t* device,
+    iree_hal_allocator_t* device_allocator,
     iree::hal::vulkan::VkDeviceHandle* logical_device,
     iree::hal::vulkan::VkCommandPoolHandle* command_pool,
     iree_hal_command_buffer_mode_t mode,
diff --git a/runtime/src/iree/hal/drivers/vulkan/direct_command_queue.cc b/runtime/src/iree/hal/drivers/vulkan/direct_command_queue.cc
index 5c6d449..8aabef7 100644
--- a/runtime/src/iree/hal/drivers/vulkan/direct_command_queue.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/direct_command_queue.cc
@@ -27,7 +27,7 @@
 DirectCommandQueue::~DirectCommandQueue() = default;
 
 iree_status_t DirectCommandQueue::TranslateBatchInfo(
-    const iree_hal_submission_batch_t* batch, VkSubmitInfo* submit_info,
+    const iree_hal_vulkan_submission_batch_t* batch, VkSubmitInfo* submit_info,
     VkTimelineSemaphoreSubmitInfo* timeline_submit_info, Arena* arena) {
   // TODO(benvanik): see if we can go to finer-grained stages.
   // For example, if this was just queue ownership transfers then we can use
@@ -92,7 +92,8 @@
 }
 
 iree_status_t DirectCommandQueue::Submit(
-    iree_host_size_t batch_count, const iree_hal_submission_batch_t* batches) {
+    iree_host_size_t batch_count,
+    const iree_hal_vulkan_submission_batch_t* batches) {
   IREE_TRACE_SCOPE_NAMED("DirectCommandQueue::Submit");
 
   // Map the submission batches to VkSubmitInfos.
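 
For reference, populating one of the new driver-local batches and handing it to a queue looks roughly like this (a sketch only: the SubmitOneBatch helper is made up, and it assumes the caller already owns the queue, command buffer, and timeline semaphore lists):

  #include "iree/hal/drivers/vulkan/command_queue.h"

  namespace iree {
  namespace hal {
  namespace vulkan {

  // Sketch: the queue only reads the batch while translating it to VkSubmitInfo
  // inside Submit(), so the batch can live on the stack for the call duration.
  static iree_status_t SubmitOneBatch(
      CommandQueue* queue, iree_hal_command_buffer_t* command_buffer,
      iree_hal_semaphore_list_t wait_semaphores,
      iree_hal_semaphore_list_t signal_semaphores) {
    iree_hal_command_buffer_t* command_buffers[1] = {command_buffer};
    iree_hal_vulkan_submission_batch_t batch = {
        /*.wait_semaphores=*/wait_semaphores,
        /*.command_buffer_count=*/1,
        /*.command_buffers=*/command_buffers,
        /*.signal_semaphores=*/signal_semaphores,
    };
    return queue->Submit(1, &batch);
  }

  }  // namespace vulkan
  }  // namespace hal
  }  // namespace iree
 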
diff --git a/runtime/src/iree/hal/drivers/vulkan/direct_command_queue.h b/runtime/src/iree/hal/drivers/vulkan/direct_command_queue.h
index 54ae2f2..5036ba3 100644
--- a/runtime/src/iree/hal/drivers/vulkan/direct_command_queue.h
+++ b/runtime/src/iree/hal/drivers/vulkan/direct_command_queue.h
@@ -25,14 +25,16 @@
                      VkQueue queue);
   ~DirectCommandQueue() override;
 
-  iree_status_t Submit(iree_host_size_t batch_count,
-                       const iree_hal_submission_batch_t* batches) override;
+  iree_status_t Submit(
+      iree_host_size_t batch_count,
+      const iree_hal_vulkan_submission_batch_t* batches) override;
 
   iree_status_t WaitIdle(iree_timeout_t timeout) override;
 
  private:
   iree_status_t TranslateBatchInfo(
-      const iree_hal_submission_batch_t* batch, VkSubmitInfo* submit_info,
+      const iree_hal_vulkan_submission_batch_t* batch,
+      VkSubmitInfo* submit_info,
       VkTimelineSemaphoreSubmitInfo* timeline_submit_info, Arena* arena);
 };
 
diff --git a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
index 7f362fd..bc7d64e 100644
--- a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
@@ -1531,9 +1531,9 @@
   // command buffer when submitted with bindings.
   if (binding_capacity > 0) {
     return iree_hal_deferred_command_buffer_create(
-        base_device, mode, command_categories, binding_capacity,
-        &device->block_pool, iree_hal_device_host_allocator(base_device),
-        out_command_buffer);
+        iree_hal_device_allocator(base_device), mode, command_categories,
+        binding_capacity, &device->block_pool,
+        iree_hal_device_host_allocator(base_device), out_command_buffer);
   }
 
   // TODO(scotttodd): revisit queue selection logic and remove this
@@ -1565,8 +1565,8 @@
       device, command_categories, queue_affinity);
 
   return iree_hal_vulkan_direct_command_buffer_allocate(
-      base_device, device->logical_device, command_pool, mode,
-      command_categories, queue_affinity, binding_capacity,
+      iree_hal_device_allocator(base_device), device->logical_device,
+      command_pool, mode, command_categories, queue_affinity, binding_capacity,
       queue->tracing_context(), device->descriptor_pool_cache,
       device->builtin_executables, &device->block_pool, out_command_buffer);
 }
@@ -1763,7 +1763,7 @@
   }
 
   if (iree_status_is_ok(status)) {
-    iree_hal_submission_batch_t batch = {
+    iree_hal_vulkan_submission_batch_t batch = {
         /*.wait_semaphores=*/wait_semaphore_list,
         /*.command_buffer_count=*/command_buffer_count,
         /*.command_buffers=*/translated_command_buffers,
diff --git a/runtime/src/iree/hal/local/inline_command_buffer.c b/runtime/src/iree/hal/local/inline_command_buffer.c
index f055e98..f69f9c2 100644
--- a/runtime/src/iree/hal/local/inline_command_buffer.c
+++ b/runtime/src/iree/hal/local/inline_command_buffer.c
@@ -92,7 +92,7 @@
 }
 
 iree_status_t iree_hal_inline_command_buffer_initialize(
-    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
     iree_allocator_t host_allocator, iree_byte_span_t storage,
@@ -130,8 +130,8 @@
   memset(command_buffer, 0, sizeof(*command_buffer));
 
   iree_hal_command_buffer_initialize(
-      device, mode, command_categories, queue_affinity, binding_capacity,
-      (uint8_t*)command_buffer + sizeof(*command_buffer),
+      device_allocator, mode, command_categories, queue_affinity,
+      binding_capacity, (uint8_t*)command_buffer + sizeof(*command_buffer),
       &iree_hal_inline_command_buffer_vtable, &command_buffer->base);
   command_buffer->host_allocator = host_allocator;
   iree_hal_inline_command_buffer_reset(command_buffer);
@@ -150,7 +150,7 @@
 }
 
 iree_status_t iree_hal_inline_command_buffer_create(
-    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
     iree_allocator_t host_allocator,
@@ -167,8 +167,8 @@
   iree_hal_command_buffer_t* command_buffer = NULL;
   if (iree_status_is_ok(status)) {
     status = iree_hal_inline_command_buffer_initialize(
-        device, mode, command_categories, queue_affinity, binding_capacity,
-        host_allocator,
+        device_allocator, mode, command_categories, queue_affinity,
+        binding_capacity, host_allocator,
         iree_make_byte_span(storage, iree_hal_inline_command_buffer_size(
                                          mode, binding_capacity)),
         &command_buffer);
diff --git a/runtime/src/iree/hal/local/inline_command_buffer.h b/runtime/src/iree/hal/local/inline_command_buffer.h
index b0214ae..0acb7fc 100644
--- a/runtime/src/iree/hal/local/inline_command_buffer.h
+++ b/runtime/src/iree/hal/local/inline_command_buffer.h
@@ -29,7 +29,7 @@
 // the caller: attempting to use the resulting command buffer as a ref object
 // is invalid.
 iree_status_t iree_hal_inline_command_buffer_initialize(
-    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
     iree_allocator_t host_allocator, iree_byte_span_t storage,
@@ -50,7 +50,7 @@
 //
 // Must have IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION set.
 iree_status_t iree_hal_inline_command_buffer_create(
-    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
     iree_allocator_t host_allocator,
diff --git a/runtime/src/iree/hal/utils/deferred_command_buffer.c b/runtime/src/iree/hal/utils/deferred_command_buffer.c
index 485c2e4..efc4444 100644
--- a/runtime/src/iree/hal/utils/deferred_command_buffer.c
+++ b/runtime/src/iree/hal/utils/deferred_command_buffer.c
@@ -161,7 +161,7 @@
 }
 
 IREE_API_EXPORT iree_status_t iree_hal_deferred_command_buffer_create(
-    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_host_size_t binding_capacity, iree_arena_block_pool_t* block_pool,
     iree_allocator_t host_allocator,
@@ -179,7 +179,7 @@
       (void**)&command_buffer);
   if (iree_status_is_ok(status)) {
     iree_hal_command_buffer_initialize(
-        device, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY,
+        device_allocator, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY,
         binding_capacity, (uint8_t*)command_buffer + sizeof(*command_buffer),
         &iree_hal_deferred_command_buffer_vtable, &command_buffer->base);
     command_buffer->host_allocator = host_allocator;
diff --git a/runtime/src/iree/hal/utils/deferred_command_buffer.h b/runtime/src/iree/hal/utils/deferred_command_buffer.h
index 8129ecd..500c405 100644
--- a/runtime/src/iree/hal/utils/deferred_command_buffer.h
+++ b/runtime/src/iree/hal/utils/deferred_command_buffer.h
@@ -42,7 +42,7 @@
 // the sequence of commands against a target command buffer implementation.
 // The command buffer can be replayed multiple times.
 IREE_API_EXPORT iree_status_t iree_hal_deferred_command_buffer_create(
-    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_host_size_t binding_capacity, iree_arena_block_pool_t* block_pool,
     iree_allocator_t host_allocator,
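 
As with the driver changes above, callers now hand the deferred command buffer the device allocator instead of the device itself; for an out-of-tree device the updated call pattern looks roughly like this (sketch only, the wrapper name is made up):

  #include "iree/hal/api.h"
  #include "iree/hal/utils/deferred_command_buffer.h"

  // Mirrors the vulkan_device.cc call site above for a hypothetical device
  // implementation; the block pool lifetime remains the caller's concern.
  static iree_status_t my_device_create_deferred_command_buffer(
      iree_hal_device_t* base_device, iree_hal_command_buffer_mode_t mode,
      iree_hal_command_category_t command_categories,
      iree_host_size_t binding_capacity, iree_arena_block_pool_t* block_pool,
      iree_hal_command_buffer_t** out_command_buffer) {
    // Pass the device allocator rather than the device per the new signature.
    return iree_hal_deferred_command_buffer_create(
        iree_hal_device_allocator(base_device), mode, command_categories,
        binding_capacity, block_pool,
        iree_hal_device_host_allocator(base_device), out_command_buffer);
  }
 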
diff --git a/runtime/src/iree/hal/utils/resource_set.c b/runtime/src/iree/hal/utils/resource_set.c
index ff82fdb..a807fd6 100644
--- a/runtime/src/iree/hal/utils/resource_set.c
+++ b/runtime/src/iree/hal/utils/resource_set.c
@@ -93,6 +93,7 @@
 }
 
 IREE_API_EXPORT void iree_hal_resource_set_free(iree_hal_resource_set_t* set) {
+  if (!set) return;
   IREE_TRACE_ZONE_BEGIN(z0);
 
 #if defined(IREE_SANITIZER_ADDRESS)
@@ -121,6 +122,7 @@
 
 IREE_API_EXPORT void iree_hal_resource_set_freeze(
     iree_hal_resource_set_t* set) {
+  if (!set) return;
 #if defined(IREE_SANITIZER_ADDRESS)
   // Poison all chunks until the resource set is freed.
   iree_hal_resource_set_chunk_t* chunk = set->chunk_head;
@@ -275,16 +277,27 @@
 IREE_API_EXPORT iree_status_t
 iree_hal_resource_set_insert(iree_hal_resource_set_t* set,
                              iree_host_size_t count, const void* resources) {
+  IREE_ASSERT_ARGUMENT(set);
+  return iree_hal_resource_set_insert_strided(set, count, resources, 0,
+                                              sizeof(iree_hal_resource_t*));
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_resource_set_insert_strided(
+    iree_hal_resource_set_t* set, iree_host_size_t count, const void* elements,
+    iree_host_size_t offset, iree_host_size_t stride) {
+  IREE_ASSERT_ARGUMENT(set);
   // For now we process one at a time. We should have a stride that lets us
   // amortize the cost of doing the MRU update and insertion allocation by
   // say slicing off 4/8/16/32 resources at a time etc. Today each miss that
   // requires a full insertion goes down the whole path of checking chunk
   // capacity and such.
-  iree_hal_resource_t* const* typed_resources =
-      (iree_hal_resource_t* const*)resources;
+  const uint8_t* elements_ptr = (const uint8_t*)elements;
   for (iree_host_size_t i = 0; i < count; ++i) {
-    IREE_RETURN_IF_ERROR(
-        iree_hal_resource_set_insert_1(set, typed_resources[i]));
+    iree_hal_resource_t* resource =
+        *(iree_hal_resource_t**)(elements_ptr + i * stride + offset);
+    if (resource) {
+      IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert_1(set, resource));
+    }
   }
   return iree_ok_status();
 }
diff --git a/runtime/src/iree/hal/utils/resource_set.h b/runtime/src/iree/hal/utils/resource_set.h
index a23e483..f02fcbd 100644
--- a/runtime/src/iree/hal/utils/resource_set.h
+++ b/runtime/src/iree/hal/utils/resource_set.h
@@ -122,10 +122,33 @@
 
 // Inserts zero or more resources into the set.
 // Each resource will be retained for at least the lifetime of the set.
+// Entries will be ignored if NULL.
 IREE_API_EXPORT iree_status_t
 iree_hal_resource_set_insert(iree_hal_resource_set_t* set,
                              iree_host_size_t count, const void* resources);
 
+// Inserts zero or more resources into the set from a user-defined data
+// structure. Each resource will be retained for at least the lifetime of the
+// set. Entries will be ignored if NULL.
+//
+// |elements| should point to the first entry of the data structure array,
+// |offset| should be the byte offset of the iree_hal_resource_t* pointer
+// within each entry, and |stride| should be the byte distance between
+// successive entries. For example, a dense list of resource pointers would
+// have an offset of 0 and a stride of sizeof(iree_hal_resource_t*).
+//
+// Example:
+//   struct my_struct_t {
+//     int something;
+//     iree_hal_resource_t* resource;
+//   } structs[5];
+//   iree_hal_resource_set_insert_strided(set, 5, structs,
+//                                        offsetof(my_struct_t, resource),
+//                                        sizeof(my_struct_t));
+IREE_API_EXPORT iree_status_t iree_hal_resource_set_insert_strided(
+    iree_hal_resource_set_t* set, iree_host_size_t count, const void* elements,
+    iree_host_size_t offset, iree_host_size_t stride);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
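 
This strided form is what lets a queue retain the buffers referenced by a binding table in-place, without first copying the pointers into a dense list. A sketch of that use (assuming the usual iree_hal_buffer_binding_t / iree_hal_buffer_binding_table_t field names; the helper name is made up):

  #include <stddef.h>

  #include "iree/hal/api.h"
  #include "iree/hal/utils/resource_set.h"

  // Sketch: relies on iree_hal_buffer_t beginning with an iree_hal_resource_t
  // so the buffer slot can be read as a resource pointer; NULL slots in sparse
  // binding tables are skipped by the helper.
  static iree_status_t retain_binding_table_buffers(
      iree_hal_resource_set_t* resource_set,
      iree_hal_buffer_binding_table_t binding_table) {
    return iree_hal_resource_set_insert_strided(
        resource_set, binding_table.count, binding_table.bindings,
        offsetof(iree_hal_buffer_binding_t, buffer),
        sizeof(iree_hal_buffer_binding_t));
  }
 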
diff --git a/runtime/src/iree/hal/utils/resource_set_test.cc b/runtime/src/iree/hal/utils/resource_set_test.cc
index 021bb1b..2771d21 100644
--- a/runtime/src/iree/hal/utils/resource_set_test.cc
+++ b/runtime/src/iree/hal/utils/resource_set_test.cc
@@ -148,6 +148,39 @@
   EXPECT_EQ(live_bitmap, 0u);
 }
 
+// Tests inserting multiple resources at a time from a user-defined struct.
+TEST_F(ResourceSetTest, InsertStrided5) {
+  auto resource_set = make_resource_set(&block_pool);
+
+  // Allocate 5 resources - this lets us test for special paths that may handle
+  // 4 at a time (to fit in SIMD registers) as well as the leftovers.
+  struct my_struct_t {
+    int ordinal;
+    iree_hal_resource_t* resource;
+  } structs[5];
+  memset(&structs, 0, sizeof(structs));
+  uint32_t live_bitmap = 0u;
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(structs); ++i) {
+    structs[i].ordinal = i;
+    IREE_ASSERT_OK(iree_hal_test_resource_create(
+        i, &live_bitmap, host_allocator, &structs[i].resource));
+  }
+  EXPECT_EQ(live_bitmap, 0x1Fu);
+
+  // Transfer ownership of the resources to the set.
+  IREE_ASSERT_OK(iree_hal_resource_set_insert_strided(
+      resource_set.get(), IREE_ARRAYSIZE(structs), structs,
+      offsetof(my_struct_t, resource), sizeof(my_struct_t)));
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(structs); ++i) {
+    iree_hal_resource_release(structs[i].resource);
+  }
+  EXPECT_EQ(live_bitmap, 0x1Fu);
+
+  // Ensure the set releases the resources.
+  resource_set.reset();
+  EXPECT_EQ(live_bitmap, 0u);
+}
+
 // Tests inserting enough resources to force set growth. This is ensured by
 // choosing a sufficiently small block size such that even 32 elements triggers
 // a growth. Of course, real usage should have at least ~4KB for the block size.