Making HAL command buffers take buffers as indirect args. (#17730)

This allows the same command buffer interface to be used for recording
indirect command buffers that reference binding table slots instead of
requiring concrete iree_hal_buffer_t pointers to be available at the
time the commands are recorded. In this first change behavior is kept
largely the same and only the arguments change; future per-backend
changes will be needed to actually support driver-side encoding of the
indirect bindings.
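
A minimal sketch of the recording-side usage this enables (illustrative
only, not part of this diff): `iree_hal_make_buffer_ref` and
`iree_hal_command_buffer_copy_buffer` appear in the changes below, while
the NULL-buffer + `buffer_slot` initializer form and the concrete
offsets/lengths are assumptions for the example.

```c
// Sketch: assumes |command_buffer|, |src_buffer|, and |dst_buffer| exist and
// that the command buffer was created with binding_capacity >= 1.

// Direct reference: the concrete buffer is recorded into the command buffer
// and kept live for as long as the command buffer is live.
iree_hal_buffer_ref_t src_ref =
    iree_hal_make_buffer_ref(src_buffer, /*offset=*/0, /*length=*/4096);
iree_hal_buffer_ref_t dst_ref =
    iree_hal_make_buffer_ref(dst_buffer, /*offset=*/0, /*length=*/4096);
IREE_IGNORE_ERROR(
    iree_hal_command_buffer_copy_buffer(command_buffer, src_ref, dst_ref));

// Indirect reference: a NULL buffer selects binding table slot 0 instead.
// The concrete buffer is supplied via the binding table on submission and is
// validated then against the requirements accumulated during recording.
iree_hal_buffer_ref_t indirect_dst_ref = {
    .buffer_slot = 0,  // slot in the binding table provided on submission
    .buffer = NULL,    // NULL = resolve from the binding table
    .offset = 0,
    .length = 4096,
};
IREE_IGNORE_ERROR(iree_hal_command_buffer_copy_buffer(command_buffer, src_ref,
                                                      indirect_dst_ref));
```

Implementations now reserve
`iree_hal_command_buffer_validation_state_size(mode, binding_capacity)`
extra bytes trailing their command buffer struct and hand that storage to
`iree_hal_command_buffer_initialize`, as the ROCm and WebGPU changes below
show.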
diff --git a/experimental/rocm/direct_command_buffer.c b/experimental/rocm/direct_command_buffer.c
index 9bde337..80fade6 100644
--- a/experimental/rocm/direct_command_buffer.c
+++ b/experimental/rocm/direct_command_buffer.c
@@ -76,10 +76,14 @@
                       IREE_HAL_ROCM_MAX_KERNEL_ARG * sizeof(void*) +
                       IREE_HAL_ROCM_MAX_KERNEL_ARG * sizeof(hipDeviceptr_t);
   iree_status_t status = iree_allocator_malloc(
-      context->host_allocator, total_size, (void**)&command_buffer);
+      context->host_allocator,
+      total_size +
+          iree_hal_command_buffer_validation_state_size(mode, binding_capacity),
+      (void**)&command_buffer);
   if (iree_status_is_ok(status)) {
     iree_hal_command_buffer_initialize(
         device, mode, command_categories, queue_affinity, binding_capacity,
+        (uint8_t*)command_buffer + total_size,
         &iree_hal_rocm_direct_command_buffer_vtable, &command_buffer->base);
     command_buffer->context = context;
     command_buffer->tracing_context = tracing_context;
@@ -207,15 +211,15 @@
 }
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_ref_t buffer_ref) {
   // nothing to do.
   return iree_ok_status();
 }
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, const void* pattern,
+    iree_hal_buffer_ref_t target_ref, const void* pattern,
     iree_host_size_t pattern_length) {
   iree_hal_rocm_direct_command_buffer_t* command_buffer =
       iree_hal_rocm_direct_command_buffer_cast(base_command_buffer);
@@ -223,11 +227,12 @@
   IREE_ROCM_TRACE_ZONE_BEGIN(command_buffer->tracing_context, 0);
 
   hipDeviceptr_t target_device_buffer = iree_hal_rocm_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(target_buffer));
-  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+      iree_hal_buffer_allocated_buffer(target_ref.buffer));
   hipDeviceptr_t dst =
-      (hipDeviceptr_t)((uintptr_t)target_device_buffer + target_offset);
-  size_t num_elements = length / pattern_length;
+      (hipDeviceptr_t)((uintptr_t)target_device_buffer +
+                       iree_hal_buffer_byte_offset(target_ref.buffer) +
+                       target_ref.offset);
+  size_t num_elements = target_ref.length / pattern_length;
   // TODO(raikonenfnu): Currently using NULL stream, need to figure out way to
   // access proper stream from command buffer
   iree_status_t status = iree_ok_status();
@@ -266,8 +271,7 @@
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
   iree_hal_rocm_direct_command_buffer_t* command_buffer =
       iree_hal_rocm_direct_command_buffer_cast(base_command_buffer);
 
@@ -279,19 +283,19 @@
   // operation and get the wrong data.
   const uint8_t* src = (const uint8_t*)source_buffer + source_offset;
   uint8_t* storage = NULL;
-  IREE_RETURN_IF_ERROR(
-      iree_arena_allocate(&command_buffer->arena, length, (void**)&storage));
-  memcpy(storage, src, length);
+  IREE_RETURN_IF_ERROR(iree_arena_allocate(
+      &command_buffer->arena, target_ref.length, (void**)&storage));
+  memcpy(storage, src, target_ref.length);
   src = storage;
 
   // Issue the copy using the scratch memory as the source.
   hipDeviceptr_t target_device_buffer = iree_hal_rocm_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(target_buffer));
+      iree_hal_buffer_allocated_buffer(target_ref.buffer));
   hipDeviceptr_t dst = (uint8_t*)target_device_buffer +
-                       iree_hal_buffer_byte_offset(target_buffer) +
-                       target_offset;
+                       iree_hal_buffer_byte_offset(target_ref.buffer) +
+                       target_ref.offset;
   ROCM_RETURN_IF_ERROR(command_buffer->context->syms,
-                       hipMemcpyHtoDAsync(dst, (void*)src, length,
+                       hipMemcpyHtoDAsync(dst, (void*)src, target_ref.length,
                                           command_buffer->context->rocm_stream),
                        "hipMemcpyHtoDAsync");
 
@@ -300,20 +304,20 @@
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
   iree_hal_rocm_direct_command_buffer_t* command_buffer =
       iree_hal_rocm_direct_command_buffer_cast(base_command_buffer);
 
   IREE_ROCM_TRACE_ZONE_BEGIN(command_buffer->tracing_context, 0);
 
   hipDeviceptr_t target_device_buffer = iree_hal_rocm_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(target_buffer));
-  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+      iree_hal_buffer_allocated_buffer(target_ref.buffer));
+  iree_device_size_t target_offset =
+      iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset;
   hipDeviceptr_t source_device_buffer = iree_hal_rocm_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(source_buffer));
-  source_offset += iree_hal_buffer_byte_offset(source_buffer);
+      iree_hal_buffer_allocated_buffer(source_ref.buffer));
+  iree_device_size_t source_offset =
+      iree_hal_buffer_byte_offset(source_ref.buffer) + source_ref.offset;
   hipDeviceptr_t dst =
       (hipDeviceptr_t)((uintptr_t)target_device_buffer + target_offset);
   hipDeviceptr_t src =
@@ -322,7 +326,7 @@
   // access proper stream from command buffer
   iree_status_t status = ROCM_RESULT_TO_STATUS(
       command_buffer->context->syms,
-      hipMemcpyAsync(dst, src, length, hipMemcpyDeviceToDevice, 0),
+      hipMemcpyAsync(dst, src, target_ref.length, hipMemcpyDeviceToDevice, 0),
       "hipMemcpyAsync");
 
   IREE_ROCM_TRACE_ZONE_END(command_buffer->tracing_context, 0);
@@ -331,9 +335,8 @@
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_collective(
     iree_hal_command_buffer_t* base_command_buffer, iree_hal_channel_t* channel,
-    iree_hal_collective_op_t op, uint32_t param,
-    iree_hal_buffer_binding_t send_binding,
-    iree_hal_buffer_binding_t recv_binding, iree_device_size_t element_count) {
+    iree_hal_collective_op_t op, uint32_t param, iree_hal_buffer_ref_t send_ref,
+    iree_hal_buffer_ref_t recv_ref, iree_device_size_t element_count) {
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                           "need rocm implementation");
 }
@@ -370,8 +373,7 @@
 static iree_status_t iree_hal_rocm_direct_command_buffer_push_descriptor_set(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
-    iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings) {
+    iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings) {
   iree_hal_rocm_direct_command_buffer_t* command_buffer =
       iree_hal_rocm_direct_command_buffer_cast(base_command_buffer);
   iree_host_size_t base_binding =
@@ -383,7 +385,7 @@
   // argument index.
   iree_hal_rocm_binding_mapping_t binding_used[IREE_HAL_ROCM_MAX_BINDING_COUNT];
   for (iree_host_size_t i = 0; i < binding_count; i++) {
-    iree_hal_rocm_binding_mapping_t buffer = {i, bindings[i].binding};
+    iree_hal_rocm_binding_mapping_t buffer = {i, bindings[i].ordinal};
     binding_used[i] = buffer;
   }
   qsort(binding_used, binding_count, sizeof(iree_hal_rocm_binding_mapping_t),
@@ -391,7 +393,7 @@
   assert(binding_count < IREE_HAL_ROCM_MAX_BINDING_COUNT &&
          "binding count larger than the max expected.");
   for (iree_host_size_t i = 0; i < binding_count; i++) {
-    iree_hal_descriptor_set_binding_t binding = bindings[binding_used[i].index];
+    iree_hal_buffer_ref_t binding = bindings[binding_used[i].index];
     hipDeviceptr_t device_ptr =
         binding.buffer
             ? (hipDeviceptr_t)((uintptr_t)iree_hal_rocm_buffer_device_pointer(
@@ -460,8 +462,7 @@
 static iree_status_t iree_hal_rocm_direct_command_buffer_dispatch_indirect(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_executable_t* executable, int32_t entry_point,
-    iree_hal_buffer_t* workgroups_buffer,
-    iree_device_size_t workgroups_offset) {
+    iree_hal_buffer_ref_t workgroups_ref) {
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                           "need rocm implementation");
 }
diff --git a/experimental/webgpu/command_buffer.c b/experimental/webgpu/command_buffer.c
index 9a88b21..04ad6ee 100644
--- a/experimental/webgpu/command_buffer.c
+++ b/experimental/webgpu/command_buffer.c
@@ -207,10 +207,14 @@
 
   iree_hal_webgpu_command_buffer_t* command_buffer = NULL;
   iree_status_t status = iree_allocator_malloc(
-      host_allocator, sizeof(*command_buffer), (void**)&command_buffer);
+      host_allocator,
+      sizeof(*command_buffer) +
+          iree_hal_command_buffer_validation_state_size(mode, binding_capacity),
+      (void**)&command_buffer);
   if (iree_status_is_ok(status)) {
     iree_hal_command_buffer_initialize(
         device, mode, command_categories, queue_affinity, binding_capacity,
+        (uint8_t*)command_buffer + sizeof(*command_buffer),
         &iree_hal_webgpu_command_buffer_vtable, &command_buffer->base);
     command_buffer->host_allocator = host_allocator;
     command_buffer->device = device_handle;
@@ -562,7 +566,8 @@
 }
 
 static iree_status_t iree_hal_webgpu_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_ref_t buffer_ref) {
   // No-op: though maybe it'd be a useful addition to the spec as otherwise
   // false dependencies can creep in.
   return iree_ok_status();
@@ -592,15 +597,15 @@
 
 static iree_status_t iree_hal_webgpu_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, const void* pattern,
+    iree_hal_buffer_ref_t target_ref, const void* pattern,
     iree_host_size_t pattern_length) {
   iree_hal_webgpu_command_buffer_t* command_buffer =
       iree_hal_webgpu_command_buffer_cast(base_command_buffer);
 
   iree_hal_webgpu_builtin_fill_buffer_t* builtin =
       &command_buffer->builtins->fill_buffer;
-  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+  iree_device_size_t target_offset =
+      iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset;
 
   // TODO(scotttodd): change to using what the vulkan emulation does
   uint32_t dword_pattern =
@@ -630,7 +635,7 @@
   // buffer is exhausted.
   const uint32_t params_data[] = {
       /*offset=*/target_offset,
-      /*length=*/length,
+      /*length=*/target_ref.length,
       /*pattern=*/dword_pattern,
   };
   uint32_t params_offset = 0;
@@ -657,9 +662,9 @@
   const iree_hal_webgpu_bind_group_binding_t buffer_binding = {
       .type = WGPUBufferBindingType_Storage,
       .buffer = iree_hal_webgpu_buffer_handle(
-          iree_hal_buffer_allocated_buffer(target_buffer)),
+          iree_hal_buffer_allocated_buffer(target_ref.buffer)),
       .offset = 0,
-      .length = length,
+      .length = target_ref.length,
   };
   WGPUBindGroup buffer_group = iree_hal_webgpu_bind_group_cache_acquire(
       command_buffer->bind_group_cache, builtin->buffer_group_layout,
@@ -670,15 +675,15 @@
 
   // NOTE: this is not the right way to do this - we need to be tiling inside
   // the fill.
-  wgpuComputePassEncoderDispatchWorkgroups(compute_pass, length, 1, 1);
+  wgpuComputePassEncoderDispatchWorkgroups(compute_pass, target_ref.length, 1,
+                                           1);
 
   return iree_ok_status();
 }
 
 static iree_status_t iree_hal_webgpu_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
   iree_hal_webgpu_command_buffer_t* command_buffer =
       iree_hal_webgpu_command_buffer_cast(base_command_buffer);
 
@@ -690,7 +695,8 @@
   uint8_t* storage_base = NULL;
   iree_hal_webgpu_command_segment_t* segment = NULL;
   iree_status_t status = iree_arena_allocate(
-      &command_buffer->arena, sizeof(*segment) + length, (void**)&storage_base);
+      &command_buffer->arena, sizeof(*segment) + target_ref.length,
+      (void**)&storage_base);
   if (iree_status_is_ok(status)) {
     // Copy the update data into the command buffer so the user can change
     // it immediately after this call returns. This results in a double copy
@@ -707,9 +713,9 @@
     segment->write_buffer.source_buffer = storage_buffer;
     segment->write_buffer.source_offset = 0;
     segment->write_buffer.target_buffer =
-        iree_hal_webgpu_buffer_handle(target_buffer);
-    segment->write_buffer.target_offset = target_offset;
-    segment->write_buffer.length = length;
+        iree_hal_webgpu_buffer_handle(target_ref.buffer);
+    segment->write_buffer.target_offset = target_ref.offset;
+    segment->write_buffer.length = target_ref.length;
     iree_hal_webgpu_command_segment_list_push_back(&command_buffer->segments,
                                                    segment);
   }
@@ -718,9 +724,7 @@
 
 static iree_status_t iree_hal_webgpu_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
   iree_hal_webgpu_command_buffer_t* command_buffer =
       iree_hal_webgpu_command_buffer_cast(base_command_buffer);
 
@@ -729,9 +733,9 @@
       command_buffer, &command_encoder));
 
   wgpuCommandEncoderCopyBufferToBuffer(
-      command_encoder, iree_hal_webgpu_buffer_handle(source_buffer),
-      source_offset, iree_hal_webgpu_buffer_handle(target_buffer),
-      target_offset, length);
+      command_encoder, iree_hal_webgpu_buffer_handle(source_ref.buffer),
+      source_ref.offset, iree_hal_webgpu_buffer_handle(target_ref.buffer),
+      target_ref.offset, target_ref.length);
 
   return iree_ok_status();
 }
@@ -761,8 +765,7 @@
 static iree_status_t iree_hal_webgpu_command_buffer_push_descriptor_set(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
-    iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings) {
+    iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings) {
   iree_hal_webgpu_command_buffer_t* command_buffer =
       iree_hal_webgpu_command_buffer_cast(base_command_buffer);
 
@@ -772,7 +775,7 @@
   iree_hal_webgpu_bind_group_binding_t* group_bindings =
       command_buffer->state.bind_groups[set].bindings;
   for (iree_host_size_t i = 0; i < binding_count; ++i) {
-    uint32_t ordinal = bindings[i].binding;
+    uint32_t ordinal = bindings[i].ordinal;
     if (ordinal >= IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_BINDING_COUNT) {
       return iree_make_status(
           IREE_STATUS_INVALID_ARGUMENT,
@@ -780,7 +783,7 @@
           IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_BINDING_COUNT);
     }
     iree_hal_webgpu_bind_group_binding_t* group_binding =
-        &group_bindings[bindings[i].binding];
+        &group_bindings[ordinal];
 
     // TODO(benvanik): lookup binding type from layout. We should also be
     // tagging whether it's dynamic here.
@@ -897,8 +900,7 @@
 static iree_status_t iree_hal_webgpu_command_buffer_dispatch_indirect(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_executable_t* executable, int32_t entry_point,
-    iree_hal_buffer_t* workgroups_buffer,
-    iree_device_size_t workgroups_offset) {
+    iree_hal_buffer_ref_t workgroups_ref) {
   iree_hal_webgpu_command_buffer_t* command_buffer =
       iree_hal_webgpu_command_buffer_cast(base_command_buffer);
 
@@ -906,8 +908,8 @@
   IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_prepare_dispatch(
       command_buffer, executable, entry_point, &compute_pass));
   wgpuComputePassEncoderDispatchWorkgroupsIndirect(
-      compute_pass, iree_hal_webgpu_buffer_handle(workgroups_buffer),
-      workgroups_offset);
+      compute_pass, iree_hal_webgpu_buffer_handle(workgroups_ref.buffer),
+      workgroups_ref.offset);
 
   return iree_ok_status();
 }
diff --git a/runtime/bindings/python/hal.cc b/runtime/bindings/python/hal.cc
index ccbf8b0..16952ea 100644
--- a/runtime/bindings/python/hal.cc
+++ b/runtime/bindings/python/hal.cc
@@ -1693,8 +1693,11 @@
             }
             CheckApiStatus(
                 iree_hal_command_buffer_copy_buffer(
-                    self.raw_ptr(), source_buffer.raw_ptr(), source_offset,
-                    target_buffer.raw_ptr(), target_offset, resolved_length),
+                    self.raw_ptr(),
+                    iree_hal_make_buffer_ref(source_buffer.raw_ptr(),
+                                             source_offset, resolved_length),
+                    iree_hal_make_buffer_ref(target_buffer.raw_ptr(),
+                                             target_offset, resolved_length)),
                 "copy command");
             if (end) {
               CheckApiStatus(iree_hal_command_buffer_end(self.raw_ptr()),
@@ -1729,8 +1732,10 @@
             }
             CheckApiStatus(
                 iree_hal_command_buffer_fill_buffer(
-                    self.raw_ptr(), target_buffer.raw_ptr(), target_offset,
-                    resolved_length, pattern_view.buf, pattern_view.len),
+                    self.raw_ptr(),
+                    iree_hal_make_buffer_ref(target_buffer.raw_ptr(),
+                                             target_offset, resolved_length),
+                    pattern_view.buf, pattern_view.len),
                 "command buffer fill");
             if (end) {
               CheckApiStatus(iree_hal_command_buffer_end(self.raw_ptr()),
diff --git a/runtime/src/iree/hal/buffer.h b/runtime/src/iree/hal/buffer.h
index 370af34..586cf77 100644
--- a/runtime/src/iree/hal/buffer.h
+++ b/runtime/src/iree/hal/buffer.h
@@ -153,6 +153,8 @@
 };
 typedef uint16_t iree_hal_memory_access_t;
 
+typedef uint32_t iree_hal_buffer_compatibility_t;
+
 // Bitfield that defines how a buffer is intended to be used.
 // Usage allows the driver to appropriately place the buffer for more
 // efficient operations of the specified types. Validation will fail if a buffer
diff --git a/runtime/src/iree/hal/command_buffer.c b/runtime/src/iree/hal/command_buffer.c
index e2bba37..7da7174 100644
--- a/runtime/src/iree/hal/command_buffer.c
+++ b/runtime/src/iree/hal/command_buffer.c
@@ -22,7 +22,9 @@
       0) {                                                                   \
     expr;                                                                    \
   }
-#define VALIDATION_STATE(command_buffer) (&(command_buffer)->validation)
+#define VALIDATION_STATE(command_buffer)                          \
+  ((iree_hal_command_buffer_validation_state_t*)((command_buffer) \
+                                                     ->validation_state))
 #else
 #define IF_VALIDATING(command_buffer, expr)
 #define VALIDATION_STATE(command_buffer) \
@@ -167,17 +169,45 @@
 
 IREE_HAL_API_RETAIN_RELEASE(command_buffer);
 
+IREE_API_EXPORT iree_host_size_t iree_hal_command_buffer_validation_state_size(
+    iree_hal_command_buffer_mode_t mode, iree_host_size_t binding_capacity) {
+#if IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE
+  return ((mode & IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED) == 0)
+             ? sizeof(iree_hal_command_buffer_validation_state_t) +
+                   binding_capacity *
+                       sizeof(iree_hal_buffer_binding_requirements_t)
+             : 0;
+#else
+  return 0;
+#endif  // IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE
+}
+
 IREE_API_EXPORT void iree_hal_command_buffer_initialize(
     iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
-    const iree_hal_command_buffer_vtable_t* vtable,
+    void* validation_state, const iree_hal_command_buffer_vtable_t* vtable,
     iree_hal_command_buffer_t* command_buffer) {
+#if IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE
+  // If validation is compiled in and the command buffer requires validation
+  // then check that state was provided.
+  IREE_ASSERT(
+      iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED) ||
+      validation_state);
+#else
+  // If validation is not compiled in then force the disable bit. This helps
+  // prevent issues with dynamic libraries that may be compiled with a different
+  // setting, but we don't really support that kind of shady use anyway.
+  mode |= IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED;
+#endif  // !IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE
+
   iree_hal_resource_initialize(vtable, &command_buffer->resource);
   command_buffer->mode = mode;
   command_buffer->allowed_categories = command_categories;
   command_buffer->queue_affinity = queue_affinity;
   command_buffer->binding_capacity = binding_capacity;
+  command_buffer->binding_count = 0;
+  command_buffer->validation_state = validation_state;
 
   // Perform initialization validation after we allocate/initialize the concrete
   // implementation.
@@ -201,6 +231,10 @@
     if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT)) {
       return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                               "inline command buffers must be one-shot");
+    } else if (binding_capacity > 0) {
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "inline command buffers cannot have indirect bindings");
     }
   }
 
@@ -372,28 +406,26 @@
 }
 
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* buffer) {
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_buffer_ref_t buffer_ref) {
   IREE_ASSERT_ARGUMENT(command_buffer);
-  IREE_ASSERT_ARGUMENT(buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
   IF_VALIDATING(command_buffer, {
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
         z0, iree_hal_command_buffer_discard_buffer_validation(
-                command_buffer, VALIDATION_STATE(command_buffer), buffer));
+                command_buffer, VALIDATION_STATE(command_buffer), buffer_ref));
   });
-  iree_status_t status =
-      _VTABLE_DISPATCH(command_buffer, discard_buffer)(command_buffer, buffer);
+  iree_status_t status = _VTABLE_DISPATCH(command_buffer, discard_buffer)(
+      command_buffer, buffer_ref);
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
 
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_fill_buffer(
-    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length,
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_ref_t target_ref,
     const void* pattern, iree_host_size_t pattern_length) {
   IREE_ASSERT_ARGUMENT(command_buffer);
-  IREE_ASSERT_ARGUMENT(target_buffer);
-  if (length == 0) {
+  if (target_ref.length == 0) {
     // No-op fill. All other validation is skipped.
     return iree_ok_status();
   }
@@ -401,24 +433,21 @@
   IF_VALIDATING(command_buffer, {
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
         z0, iree_hal_command_buffer_fill_buffer_validation(
-                command_buffer, VALIDATION_STATE(command_buffer), target_buffer,
-                target_offset, length, pattern, pattern_length));
+                command_buffer, VALIDATION_STATE(command_buffer), target_ref,
+                pattern, pattern_length));
   });
   iree_status_t status = _VTABLE_DISPATCH(command_buffer, fill_buffer)(
-      command_buffer, target_buffer, target_offset, length, pattern,
-      pattern_length);
+      command_buffer, target_ref, pattern, pattern_length);
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
 
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_update_buffer(
     iree_hal_command_buffer_t* command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
   IREE_ASSERT_ARGUMENT(command_buffer);
   IREE_ASSERT_ARGUMENT(source_buffer);
-  IREE_ASSERT_ARGUMENT(target_buffer);
-  if (length == 0) {
+  if (target_ref.length == 0) {
     // No-op update. All other validation is skipped.
     return iree_ok_status();
   }
@@ -427,21 +456,19 @@
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
         z0, iree_hal_command_buffer_update_buffer_validation(
                 command_buffer, VALIDATION_STATE(command_buffer), source_buffer,
-                source_offset, target_buffer, target_offset, length));
+                source_offset, target_ref));
   });
   iree_status_t status = _VTABLE_DISPATCH(command_buffer, update_buffer)(
-      command_buffer, source_buffer, source_offset, target_buffer,
-      target_offset, length);
+      command_buffer, source_buffer, source_offset, target_ref);
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
 
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_copy_buffer(
-    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* source_buffer,
-    iree_device_size_t source_offset, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length) {
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_ref_t source_ref,
+    iree_hal_buffer_ref_t target_ref) {
   IREE_ASSERT_ARGUMENT(command_buffer);
-  if (length == 0) {
+  if (target_ref.length == 0) {
     // No-op copy. All other validation is skipped.
     return iree_ok_status();
   }
@@ -449,21 +476,19 @@
   IF_VALIDATING(command_buffer, {
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
         z0, iree_hal_command_buffer_copy_buffer_validation(
-                command_buffer, VALIDATION_STATE(command_buffer), source_buffer,
-                source_offset, target_buffer, target_offset, length));
+                command_buffer, VALIDATION_STATE(command_buffer), source_ref,
+                target_ref));
   });
   iree_status_t status = _VTABLE_DISPATCH(command_buffer, copy_buffer)(
-      command_buffer, source_buffer, source_offset, target_buffer,
-      target_offset, length);
+      command_buffer, source_ref, target_ref);
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
 
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_collective(
     iree_hal_command_buffer_t* command_buffer, iree_hal_channel_t* channel,
-    iree_hal_collective_op_t op, uint32_t param,
-    iree_hal_buffer_binding_t send_binding,
-    iree_hal_buffer_binding_t recv_binding, iree_device_size_t element_count) {
+    iree_hal_collective_op_t op, uint32_t param, iree_hal_buffer_ref_t send_ref,
+    iree_hal_buffer_ref_t recv_ref, iree_device_size_t element_count) {
   IREE_ASSERT_ARGUMENT(command_buffer);
   IREE_ASSERT_ARGUMENT(channel);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -471,7 +496,7 @@
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
         z0, iree_hal_command_buffer_collective_validation(
                 command_buffer, VALIDATION_STATE(command_buffer), channel, op,
-                param, send_binding, recv_binding, element_count));
+                param, send_ref, recv_ref, element_count));
   });
 #if IREE_HAL_VERBOSE_TRACING_ENABLE
   IREE_TRACE({
@@ -482,8 +507,7 @@
   });
 #endif  // IREE_HAL_VERBOSE_TRACING_ENABLE
   iree_status_t status = _VTABLE_DISPATCH(command_buffer, collective)(
-      command_buffer, channel, op, param, send_binding, recv_binding,
-      element_count);
+      command_buffer, channel, op, param, send_ref, recv_ref, element_count);
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
@@ -514,8 +538,7 @@
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_push_descriptor_set(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
-    iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings) {
+    iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings) {
   IREE_ASSERT_ARGUMENT(command_buffer);
   IREE_ASSERT_ARGUMENT(pipeline_layout);
   IREE_ASSERT_ARGUMENT(!binding_count || bindings);
@@ -579,21 +602,18 @@
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_dispatch_indirect(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_executable_t* executable, int32_t entry_point,
-    iree_hal_buffer_t* workgroups_buffer,
-    iree_device_size_t workgroups_offset) {
+    iree_hal_buffer_ref_t workgroups_ref) {
   IREE_ASSERT_ARGUMENT(command_buffer);
   IREE_ASSERT_ARGUMENT(executable);
-  IREE_ASSERT_ARGUMENT(workgroups_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
   IF_VALIDATING(command_buffer, {
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
         z0, iree_hal_command_buffer_dispatch_indirect_validation(
                 command_buffer, VALIDATION_STATE(command_buffer), executable,
-                entry_point, workgroups_buffer, workgroups_offset));
+                entry_point, workgroups_ref));
   });
   iree_status_t status = _VTABLE_DISPATCH(command_buffer, dispatch_indirect)(
-      command_buffer, executable, entry_point, workgroups_buffer,
-      workgroups_offset);
+      command_buffer, executable, entry_point, workgroups_ref);
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
@@ -606,22 +626,34 @@
     iree_hal_command_buffer_t* command_buffer,
     const iree_hal_buffer_binding_table_t* binding_table) {
   IREE_ASSERT_ARGUMENT(command_buffer);
+
+  // Only check binding tables when one is required and otherwise ignore any
+  // bindings provided. Require at least as many bindings in the table as are
+  // used by the command buffer. This may be less than the total capacity the
+  // command buffer was allocated with.
+  if (command_buffer->binding_count == 0) {
+    return iree_ok_status();
+  } else if (!binding_table) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "indirect command buffer requires at least %u "
+                            "bindings but no binding table was provided",
+                            command_buffer->binding_count);
+  } else if (binding_table->count < command_buffer->binding_count) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "indirect command buffer requires at least %u "
+                            "bindings but only %" PRIhsz " were provided",
+                            command_buffer->binding_count,
+                            binding_table->count);
+  }
+
+  // Validate the binding table against the commands that consume its entries.
+  // This is O(binding_count) and is only done when validation is requested on
+  // the command buffer.
   IF_VALIDATING(command_buffer, {
-    // Only check binding tables when one is required and otherwise ignore any
-    // bindings provided.
-    if (command_buffer->binding_capacity == 0) {
-      return iree_ok_status();
-    } else if (!binding_table ||
-               binding_table->count < command_buffer->binding_capacity) {
-      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
-                              "indirect command buffer requires at least %u "
-                              "bindings but only %" PRIhsz " were provided ",
-                              command_buffer->binding_capacity,
-                              binding_table ? binding_table->count : 0);
-    }
-    // TODO(benvanik): validate each binding against the requirements of the
-    // command buffer.
+    IREE_RETURN_IF_ERROR(iree_hal_command_buffer_binding_table_validation(
+        command_buffer, VALIDATION_STATE(command_buffer), *binding_table));
   });
+
   return iree_ok_status();
 }
 
@@ -650,26 +682,30 @@
       switch (transfer_command->type) {
         case IREE_HAL_TRANSFER_COMMAND_TYPE_FILL:
           status = iree_hal_command_buffer_fill_buffer(
-              command_buffer, transfer_command->fill.target_buffer,
-              transfer_command->fill.target_offset,
-              transfer_command->fill.length, transfer_command->fill.pattern,
+              command_buffer,
+              iree_hal_make_buffer_ref(transfer_command->fill.target_buffer,
+                                       transfer_command->fill.target_offset,
+                                       transfer_command->fill.length),
+              transfer_command->fill.pattern,
               transfer_command->fill.pattern_length);
           break;
         case IREE_HAL_TRANSFER_COMMAND_TYPE_COPY:
           status = iree_hal_command_buffer_copy_buffer(
-              command_buffer, transfer_command->copy.source_buffer,
-              transfer_command->copy.source_offset,
-              transfer_command->copy.target_buffer,
-              transfer_command->copy.target_offset,
-              transfer_command->copy.length);
+              command_buffer,
+              iree_hal_make_buffer_ref(transfer_command->copy.source_buffer,
+                                       transfer_command->copy.source_offset,
+                                       transfer_command->copy.length),
+              iree_hal_make_buffer_ref(transfer_command->copy.target_buffer,
+                                       transfer_command->copy.target_offset,
+                                       transfer_command->copy.length));
           break;
         case IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE:
           status = iree_hal_command_buffer_update_buffer(
               command_buffer, transfer_command->update.source_buffer,
               transfer_command->update.source_offset,
-              transfer_command->update.target_buffer,
-              transfer_command->update.target_offset,
-              transfer_command->update.length);
+              iree_hal_make_buffer_ref(transfer_command->update.target_buffer,
+                                       transfer_command->update.target_offset,
+                                       transfer_command->update.length));
           break;
         default:
           status =
diff --git a/runtime/src/iree/hal/command_buffer.h b/runtime/src/iree/hal/command_buffer.h
index 8729c94..49c516e 100644
--- a/runtime/src/iree/hal/command_buffer.h
+++ b/runtime/src/iree/hal/command_buffer.h
@@ -105,11 +105,10 @@
   // When indirectly referencing a binding table buffer this will be added to
   // the base offset of the bound buffer.
   iree_device_size_t offset;
-  // Length, in bytes, of the buffer that is available to the executable.
+  // Length, in bytes, of the buffer after the offset that is accessed.
   // This can be IREE_WHOLE_BUFFER, however note that if the entire buffer
   // contents are larger than supported by the device (~128MiB, usually) this
-  // will fail. If the descriptor type is dynamic this will be used for all
-  // ranges regardless of offset.
+  // will fail.
   iree_device_size_t length;
 } iree_hal_buffer_ref_t;
 
@@ -206,46 +205,9 @@
   iree_hal_access_scope_t target_scope;
   // Buffer the barrier is restricted to.
   // The barrier will apply to the entire physical device allocation.
-  iree_hal_buffer_t* buffer;
-  // Relative offset/length within |buffer| (which may itself be mapped into the
-  // device allocation at an offset).
-  iree_device_size_t offset;
-  iree_device_size_t length;
+  iree_hal_buffer_ref_t buffer_ref;
 } iree_hal_buffer_barrier_t;
 
-// Specifies a descriptor set binding.
-// The range specified by [offset, length) will be made available to executables
-// on the given binding. If the descriptor type is dynamic then the range will
-// be [offset + dynamic_offset, length).
-//
-// The IREE HAL buffer type may internally be offset; such offset is applied
-// here as if it were the base address of the buffer. Note that the offset will
-// be applied at the time the binding is recording into the command buffer.
-//
-// Maps to VkDescriptorSetBinding.
-typedef struct iree_hal_descriptor_set_binding_t {
-  // The binding number of this entry and corresponds to a resource of the
-  // same binding number in the executable interface.
-  uint32_t binding : 8;
-  // Binding table slot the buffer will be sourced from if buffer is NULL.
-  // Only valid on command buffers that support indirect execution.
-  uint32_t buffer_slot : 24;
-  // Buffer bound to the binding number.
-  // If NULL then the buffer_slot will be used to resolve the buffer at command
-  // buffer execution time from the binding table.
-  iree_hal_buffer_t* buffer;
-  // Offset, in bytes, into the buffer that the binding starts at.
-  // When indirectly referencing a binding table buffer this will be added to
-  // the base offset of the bound buffer.
-  iree_device_size_t offset;
-  // Length, in bytes, of the buffer that is available to the executable.
-  // This can be IREE_WHOLE_BUFFER, however note that if the entire buffer
-  // contents are larger than supported by the device (~128MiB, usually) this
-  // will fail. If the descriptor type is dynamic this will be used for all
-  // ranges regardless of offset.
-  iree_device_size_t length;
-} iree_hal_descriptor_set_binding_t;
-
 // Specifies the type of collective operation.
 enum iree_hal_collective_kind_e {
   // Gathers N*|element_count| elements of the specified type in |recv_binding|
@@ -453,18 +415,6 @@
 IREE_API_EXPORT iree_string_view_t iree_hal_command_category_format(
     iree_hal_command_category_t value, iree_bitfield_string_temp_t* out_temp);
 
-// Storage for command buffer validation state.
-// Designed to be embedded in concrete implementations that want validation.
-typedef struct iree_hal_command_buffer_validation_state_t {
-  iree_hal_device_t* device;
-  // 1 when in a begin/end recording sequence.
-  uint32_t is_recording : 1;
-  // Debug group depth for tracking proper begin/end pairing.
-  int32_t debug_group_depth;
-  // TODO(benvanik): current pipeline layout/descriptor set layout info.
-  // TODO(benvanik): valid push constant bit ranges.
-} iree_hal_command_buffer_validation_state_t;
-
 // Maximum size of any update in iree_hal_command_buffer_update_buffer.
 // 64KB is the limit on Vulkan and we uniformly use that today across all
 // targets as to not need too much command buffer memory.
@@ -506,7 +456,7 @@
 //===----------------------------------------------------------------------===//
 
 // Asynchronous command buffer recording interface.
-// Commands are recorded by the implementation for later submission to command
+// Commands are recorded by the implementation for later submission to device
 // queues.
 //
 // Buffers, events, and programs referenced must remain valid and not be
@@ -517,6 +467,19 @@
 // will be retained for as long as the command buffer is live or until it is
 // reset.
 //
+// Buffers referenced by a command buffer may be either direct (a concrete
+// iree_hal_buffer_t reference) or indirect (a binding table slot ordinal).
+// Direct buffer references are embedded in the command buffer and cannot be
+// changed; the referenced resources will be kept live for as long as the
+// command buffer is live. Indirect references are placeholders: a buffer is
+// provided when the command buffer is submitted to a device queue, allowing
+// the same command buffer to be reused with different buffers. Indirect
+// command buffers are not concurrently schedulable unless specified, as many
+// implementations need per-submission shadow resources.
+// Validation of direct buffer references happens as the commands are recorded
+// and further validation is not required. Indirect buffer references are
+// validated upon submission with a populated binding table.
+//
 // Errors that can be recognized when operations are enqueued will be returned
 // immediately, such as invalid argument errors. Errors that can only be
 // determined at execution time will be returned on semaphores. Once a failure
@@ -656,7 +619,8 @@
 // This is because the discard may be used to elide write backs to host memory
 // or aggressively reuse the allocation for other purposes.
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* buffer);
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_buffer_ref_t buffer_ref);
 
 // Fills the target buffer with the given repeating value.
 // Expects that |pattern_length| is one of 1, 2, or 4 and that the offset and
@@ -664,8 +628,7 @@
 // The target buffer must be compatible with the devices owned by this
 // device queue and be allocated with IREE_HAL_BUFFER_USAGE_TRANSFER.
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_fill_buffer(
-    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length,
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_ref_t target_ref,
     const void* pattern, iree_host_size_t pattern_length);
 
 // Updates a range of the given target buffer from the source host memory.
@@ -679,8 +642,7 @@
 // device queue and be allocated with IREE_HAL_BUFFER_USAGE_TRANSFER.
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_update_buffer(
     iree_hal_command_buffer_t* command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length);
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref);
 
 // Copies a range of one buffer to another.
 // Both buffers must be compatible with the devices owned by this device
@@ -691,18 +653,16 @@
 // This can be used to perform device->host, host->device, and device->device
 // copies.
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_copy_buffer(
-    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* source_buffer,
-    iree_device_size_t source_offset, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length);
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_ref_t source_ref,
+    iree_hal_buffer_ref_t target_ref);
 
 // Dispatches a collective operation defined by |op| using the given buffers.
 // |param| must be specified for operations that require a root/peer rank
 // identifier and is otherwise ignored.
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_collective(
     iree_hal_command_buffer_t* command_buffer, iree_hal_channel_t* channel,
-    iree_hal_collective_op_t op, uint32_t param,
-    iree_hal_buffer_binding_t send_binding,
-    iree_hal_buffer_binding_t recv_binding, iree_device_size_t element_count);
+    iree_hal_collective_op_t op, uint32_t param, iree_hal_buffer_ref_t send_ref,
+    iree_hal_buffer_ref_t recv_ref, iree_device_size_t element_count);
 
 // Pushes an inline set of constants that can be accessed by subsequent
 // dispatches using a compatible pipeline layout.
@@ -715,18 +675,25 @@
     iree_hal_pipeline_layout_t* pipeline_layout, iree_host_size_t offset,
     const void* values, iree_host_size_t values_length);
 
-// Pushes a descriptor set and associates it with |set|.
+// Pushes descriptor set bindings and associates them with |set|.
 // This uses an internal ringbuffer inside of the command buffer to avoid the
 // need for creating and binding descriptor sets and managing their lifetime.
 //
-// The descriptor set will remain bound and valid so long as the executable
-// layouts used by dispatches are compatible (same descriptor layouts and push
-// constant sizes).
+// The |bindings| will remain bound and valid on the command buffer during
+// recording. Each binding must have its ordinal specified indicating which
+// descriptor set slot is being assigned.
+//
+// Provided bindings may have a buffer directly referenced that will be recorded
+// into the command buffer and kept live for the lifetime of the command buffer.
+// Alternatively, bindings can reference slots in the binding table, whose
+// capacity was specified upon command buffer creation. Such indirect bindings
+// have their buffers specified upon submission and the buffers in the provided
+// binding table are kept live only until the submission referencing them
+// completes.
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_push_descriptor_set(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
-    iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings);
+    iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings);
 
 // Dispatches an execution request.
 // The request may execute overlapped with any other transfer operation or
@@ -755,7 +722,7 @@
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_dispatch_indirect(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_executable_t* executable, int32_t entry_point,
-    iree_hal_buffer_t* workgroups_buffer, iree_device_size_t workgroups_offset);
+    iree_hal_buffer_ref_t workgroups_ref);
 
 //===----------------------------------------------------------------------===//
 // Validation support
@@ -877,30 +844,27 @@
       const iree_hal_buffer_barrier_t* buffer_barriers);
 
   iree_status_t(IREE_API_PTR* discard_buffer)(
-      iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* buffer);
+      iree_hal_command_buffer_t* command_buffer,
+      iree_hal_buffer_ref_t buffer_ref);
 
   iree_status_t(IREE_API_PTR* fill_buffer)(
       iree_hal_command_buffer_t* command_buffer,
-      iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-      iree_device_size_t length, const void* pattern,
+      iree_hal_buffer_ref_t target_ref, const void* pattern,
       iree_host_size_t pattern_length);
 
   iree_status_t(IREE_API_PTR* update_buffer)(
       iree_hal_command_buffer_t* command_buffer, const void* source_buffer,
-      iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
-      iree_device_size_t target_offset, iree_device_size_t length);
+      iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref);
 
   iree_status_t(IREE_API_PTR* copy_buffer)(
       iree_hal_command_buffer_t* command_buffer,
-      iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
-      iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-      iree_device_size_t length);
+      iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref);
 
   iree_status_t(IREE_API_PTR* collective)(
       iree_hal_command_buffer_t* command_buffer, iree_hal_channel_t* channel,
       iree_hal_collective_op_t op, uint32_t param,
-      iree_hal_buffer_binding_t send_binding,
-      iree_hal_buffer_binding_t recv_binding, iree_device_size_t element_count);
+      iree_hal_buffer_ref_t send_ref, iree_hal_buffer_ref_t recv_ref,
+      iree_device_size_t element_count);
 
   iree_status_t(IREE_API_PTR* push_constants)(
       iree_hal_command_buffer_t* command_buffer,
@@ -910,8 +874,7 @@
   iree_status_t(IREE_API_PTR* push_descriptor_set)(
       iree_hal_command_buffer_t* command_buffer,
       iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
-      iree_host_size_t binding_count,
-      const iree_hal_descriptor_set_binding_t* bindings);
+      iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings);
 
   iree_status_t(IREE_API_PTR* dispatch)(
       iree_hal_command_buffer_t* command_buffer,
@@ -921,8 +884,7 @@
   iree_status_t(IREE_API_PTR* dispatch_indirect)(
       iree_hal_command_buffer_t* command_buffer,
       iree_hal_executable_t* executable, int32_t entry_point,
-      iree_hal_buffer_t* workgroups_buffer,
-      iree_device_size_t workgroups_offset);
+      iree_hal_buffer_ref_t workgroups_ref);
 } iree_hal_command_buffer_vtable_t;
 IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_command_buffer_vtable_t);
 
@@ -932,17 +894,20 @@
   iree_hal_command_category_t allowed_categories;
   iree_hal_queue_affinity_t queue_affinity;
   uint32_t binding_capacity;
-
-#if IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE
-  iree_hal_command_buffer_validation_state_t validation;
-#endif  // IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE
+  uint32_t binding_count;
+  void* validation_state;
 };
 
+// Returns the total size of the additional command buffer storage required for
+// validating the command buffer. Returns 0 if no validation state is required.
+IREE_API_EXPORT iree_host_size_t iree_hal_command_buffer_validation_state_size(
+    iree_hal_command_buffer_mode_t mode, iree_host_size_t binding_capacity);
+
 IREE_API_EXPORT void iree_hal_command_buffer_initialize(
     iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
-    const iree_hal_command_buffer_vtable_t* vtable,
+    void* validation_state, const iree_hal_command_buffer_vtable_t* vtable,
     iree_hal_command_buffer_t* command_buffer);
 
 IREE_API_EXPORT void iree_hal_command_buffer_destroy(
diff --git a/runtime/src/iree/hal/command_buffer_validation.c b/runtime/src/iree/hal/command_buffer_validation.c
index 80648f0..6ae820a 100644
--- a/runtime/src/iree/hal/command_buffer_validation.c
+++ b/runtime/src/iree/hal/command_buffer_validation.c
@@ -23,7 +23,7 @@
 // Returns success iff the queue supports the given command categories.
 static iree_status_t iree_hal_command_buffer_validate_categories(
     const iree_hal_command_buffer_t* command_buffer,
-    iree_hal_command_buffer_validation_state_t* validation_state,
+    const iree_hal_command_buffer_validation_state_t* validation_state,
     iree_hal_command_category_t required_categories) {
   if (IREE_UNLIKELY(!validation_state->is_recording)) {
     return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
@@ -54,7 +54,7 @@
 // Returns success iff the buffer is compatible with the device.
 static iree_status_t iree_hal_command_buffer_validate_buffer_compatibility(
     const iree_hal_command_buffer_t* command_buffer,
-    iree_hal_command_buffer_validation_state_t* validation_state,
+    const iree_hal_command_buffer_validation_state_t* validation_state,
     iree_hal_buffer_t* buffer,
     iree_hal_buffer_compatibility_t required_compatibility,
     iree_hal_buffer_usage_t intended_usage) {
@@ -89,11 +89,117 @@
   return iree_ok_status();
 }
 
+static iree_status_t iree_hal_command_buffer_validate_binding_requirements(
+    iree_hal_command_buffer_t* command_buffer,
+    const iree_hal_command_buffer_validation_state_t* validation_state,
+    iree_hal_buffer_binding_t binding,
+    iree_hal_buffer_binding_requirements_t requirements) {
+  // Check for binding presence.
+  if (requirements.usage == IREE_HAL_BUFFER_USAGE_NONE) {
+    // Binding slot is unused and its value in the table is ignored.
+    return iree_ok_status();
+  } else if (!binding.buffer) {
+    // Binding is used and required.
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "binding table slot requires a buffer but none was provided");
+  }
+
+  // Ensure the buffer is compatible with the device.
+  // NOTE: this check is very slow! We may want to disable this outside of debug
+  // mode or try to fast path it if the buffer is known-good.
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_compatibility(
+      command_buffer, validation_state, binding.buffer,
+      requirements.required_compatibility, requirements.usage));
+
+  // Verify buffer compatibility.
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage(
+      iree_hal_buffer_allowed_usage(binding.buffer), requirements.usage));
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_access(
+      iree_hal_buffer_allowed_access(binding.buffer), requirements.access));
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type(
+      iree_hal_buffer_memory_type(binding.buffer), requirements.type));
+
+  // Verify that the binding range is valid and that any commands that reference
+  // it are in range.
+  if (requirements.max_byte_offset > 0) {
+    iree_device_size_t end = binding.offset + requirements.max_byte_offset;
+    if (IREE_UNLIKELY(end > binding.length)) {
+      return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                              "at least one command attempted to access an "
+                              "address outside of the valid bound buffer "
+                              "range (max_byte_offset=%" PRIdsz
+                              ", end(inc)=%" PRIdsz ", binding offset=%" PRIdsz
+                              ", binding length=%" PRIdsz ")",
+                              requirements.max_byte_offset, end - 1,
+                              binding.offset, binding.length);
+    }
+  }
+
+  // Ensure the offset meets the minimum alignment required by the commands.
+  if (requirements.min_byte_alignment &&
+      (binding.offset % requirements.min_byte_alignment) != 0) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "binding offset does not match the required "
+                            "alignment of one or more commands (offset=%" PRIdsz
+                            ", min_byte_alignment=%" PRIhsz ")",
+                            binding.offset, requirements.min_byte_alignment);
+  }
+
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_command_buffer_validate_buffer_requirements(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_command_buffer_validation_state_t* validation_state,
+    iree_hal_buffer_ref_t buffer_ref,
+    iree_hal_buffer_binding_requirements_t requirements) {
+  // If the buffer is directly specified we can validate it inline.
+  if (buffer_ref.buffer) {
+    iree_hal_buffer_binding_t binding = {
+        .buffer = buffer_ref.buffer,
+        .offset = 0,
+        .length = buffer_ref.offset + buffer_ref.length,
+    };
+    return iree_hal_command_buffer_validate_binding_requirements(
+        command_buffer, validation_state, binding, requirements);
+  }
+
+  // Ensure the buffer binding table slot is within range. Note that the
+  // binding table provided may have more bindings than required so we only
+  // verify against the declared command buffer capacity.
+  if (IREE_UNLIKELY(buffer_ref.buffer_slot >=
+                    command_buffer->binding_capacity)) {
+    return iree_make_status(
+        IREE_STATUS_OUT_OF_RANGE,
+        "indirect buffer reference slot %u is out of range of the declared "
+        "binding capacity of the command buffer %u",
+        buffer_ref.buffer_slot, command_buffer->binding_capacity);
+  }
+  command_buffer->binding_count =
+      iree_max(command_buffer->binding_count, buffer_ref.buffer_slot + 1);
+
+  // Merge the binding requirements into the table.
+  iree_hal_buffer_binding_requirements_t* table_requirements =
+      &validation_state->binding_requirements[buffer_ref.buffer_slot];
+  table_requirements->required_compatibility |=
+      requirements.required_compatibility;
+  table_requirements->usage |= requirements.usage;
+  table_requirements->access |= requirements.access;
+  table_requirements->type |= requirements.type;
+  table_requirements->max_byte_offset = iree_max(
+      table_requirements->max_byte_offset, requirements.max_byte_offset);
+  table_requirements->min_byte_alignment = iree_device_size_lcm(
+      table_requirements->min_byte_alignment, requirements.min_byte_alignment);
+
+  return iree_ok_status();
+}
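
To make the merge behavior concrete: commands that reference a binding table slot do not validate immediately, they accumulate requirements that are checked later. A small worked sketch under the semantics above (values are illustrative, and iree_device_size_lcm is assumed to treat a zero alignment as "no constraint", as the merge code implies):

// Hypothetical: slot 3 is written by a fill of [0, 128) with a 4-byte
// pattern and read by a dispatch touching [0, 256) at 16-byte alignment.
iree_hal_buffer_binding_requirements_t reqs = {0};
// The fill contributes:
reqs.usage |= IREE_HAL_BUFFER_USAGE_TRANSFER_TARGET;
reqs.access |= IREE_HAL_MEMORY_ACCESS_WRITE;
reqs.max_byte_offset = iree_max(reqs.max_byte_offset, 128);
reqs.min_byte_alignment = iree_device_size_lcm(reqs.min_byte_alignment, 4);
// The dispatch contributes:
reqs.usage |= IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE_READ;
reqs.access |= IREE_HAL_MEMORY_ACCESS_READ;
reqs.max_byte_offset = iree_max(reqs.max_byte_offset, 256);  // now 256
reqs.min_byte_alignment =
    iree_device_size_lcm(reqs.min_byte_alignment, 16);  // lcm(4, 16) = 16
// Whatever buffer is bound to slot 3 at execution time must satisfy the
// union of both commands' requirements.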
+
 // Returns success iff the currently bound descriptor sets are valid for the
 // given executable entry point.
 static iree_status_t iree_hal_command_buffer_validate_dispatch_bindings(
     iree_hal_command_buffer_t* command_buffer,
-    iree_hal_command_buffer_validation_state_t* validation_state,
+    const iree_hal_command_buffer_validation_state_t* validation_state,
     iree_hal_executable_t* executable, int32_t entry_point) {
   // TODO(benvanik): validate buffers referenced have compatible memory types
   // and access rights.
@@ -106,6 +212,7 @@
     iree_hal_command_buffer_validation_state_t* out_validation_state) {
   out_validation_state->device = device;
   out_validation_state->is_recording = false;
+  out_validation_state->debug_group_depth = 0;
 }
 
 iree_status_t iree_hal_command_buffer_begin_validation(
@@ -210,13 +317,16 @@
 iree_status_t iree_hal_command_buffer_discard_buffer_validation(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
-    iree_hal_buffer_t* buffer) {
+    iree_hal_buffer_ref_t buffer_ref) {
   IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
       command_buffer, validation_state, IREE_HAL_COMMAND_CATEGORY_TRANSFER));
 
-  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type(
-      iree_hal_buffer_memory_type(buffer),
-      IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE));
+  const iree_hal_buffer_binding_requirements_t buffer_reqs = {
+      .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE,
+      .max_byte_offset = buffer_ref.offset + buffer_ref.length,
+  };
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_requirements(
+      command_buffer, validation_state, buffer_ref, buffer_reqs));
 
   return iree_ok_status();
 }
@@ -224,27 +334,10 @@
 iree_status_t iree_hal_command_buffer_fill_buffer_validation(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, const void* pattern,
+    iree_hal_buffer_ref_t target_ref, const void* pattern,
     iree_host_size_t pattern_length) {
   IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
       command_buffer, validation_state, IREE_HAL_COMMAND_CATEGORY_TRANSFER));
-  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_compatibility(
-      command_buffer, validation_state, target_buffer,
-      IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER,
-      IREE_HAL_BUFFER_USAGE_TRANSFER_TARGET));
-
-  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type(
-      iree_hal_buffer_memory_type(target_buffer),
-      IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE));
-  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_access(
-      iree_hal_buffer_allowed_access(target_buffer),
-      IREE_HAL_MEMORY_ACCESS_WRITE));
-  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage(
-      iree_hal_buffer_allowed_usage(target_buffer),
-      IREE_HAL_BUFFER_USAGE_TRANSFER_TARGET));
-  IREE_RETURN_IF_ERROR(
-      iree_hal_buffer_validate_range(target_buffer, target_offset, length));
 
   // Ensure the value length is supported.
   if (pattern_length != 1 && pattern_length != 2 && pattern_length != 4) {
@@ -254,16 +347,27 @@
                             pattern_length);
   }
 
-  // Ensure the offset and length have an alignment matching the value length.
-  if ((target_offset % pattern_length) != 0 || (length % pattern_length) != 0) {
+  if ((target_ref.offset % pattern_length) != 0 ||
+      (target_ref.length % pattern_length) != 0) {
     return iree_make_status(
         IREE_STATUS_INVALID_ARGUMENT,
-        "fill offset and/or length do not match the natural alignment of the "
-        "fill value (target_offset=%" PRIdsz ", length=%" PRIdsz
+        "binding offset and/or length do not match the required alignment of "
+        "one or more command (offset=%" PRIdsz ", length=%" PRIdsz
         ", pattern_length=%" PRIhsz ")",
-        target_offset, length, pattern_length);
+        target_ref.offset, target_ref.length, pattern_length);
   }
 
+  const iree_hal_buffer_binding_requirements_t target_reqs = {
+      .required_compatibility = IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER,
+      .usage = IREE_HAL_BUFFER_USAGE_TRANSFER_TARGET,
+      .access = IREE_HAL_MEMORY_ACCESS_WRITE,
+      .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE,
+      .max_byte_offset = target_ref.offset + target_ref.length,
+      .min_byte_alignment = pattern_length,
+  };
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_requirements(
+      command_buffer, validation_state, target_ref, target_reqs));
+
   return iree_ok_status();
 }
 
@@ -271,26 +375,19 @@
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
     const void* source_buffer, iree_host_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length) {
+    iree_hal_buffer_ref_t target_ref) {
   IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
       command_buffer, validation_state, IREE_HAL_COMMAND_CATEGORY_TRANSFER));
-  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_compatibility(
-      command_buffer, validation_state, target_buffer,
-      IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER,
-      IREE_HAL_BUFFER_USAGE_TRANSFER_TARGET));
 
-  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type(
-      iree_hal_buffer_memory_type(target_buffer),
-      IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE));
-  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_access(
-      iree_hal_buffer_allowed_access(target_buffer),
-      IREE_HAL_MEMORY_ACCESS_WRITE));
-  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage(
-      iree_hal_buffer_allowed_usage(target_buffer),
-      IREE_HAL_BUFFER_USAGE_TRANSFER_TARGET));
-  IREE_RETURN_IF_ERROR(
-      iree_hal_buffer_validate_range(target_buffer, target_offset, length));
+  const iree_hal_buffer_binding_requirements_t target_reqs = {
+      .required_compatibility = IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER,
+      .usage = IREE_HAL_BUFFER_USAGE_TRANSFER_TARGET,
+      .access = IREE_HAL_MEMORY_ACCESS_WRITE,
+      .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE,
+      .max_byte_offset = target_ref.offset + target_ref.length,
+  };
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_requirements(
+      command_buffer, validation_state, target_ref, target_reqs));
 
   return iree_ok_status();
 }
@@ -298,65 +395,45 @@
 iree_status_t iree_hal_command_buffer_copy_buffer_validation(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
-    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
   IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
       command_buffer, validation_state, IREE_HAL_COMMAND_CATEGORY_TRANSFER));
-  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_compatibility(
-      command_buffer, validation_state, source_buffer,
-      IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER,
-      IREE_HAL_BUFFER_USAGE_TRANSFER_SOURCE));
-  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_compatibility(
-      command_buffer, validation_state, target_buffer,
-      IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER,
-      IREE_HAL_BUFFER_USAGE_TRANSFER_TARGET));
 
-  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_access(
-      iree_hal_buffer_allowed_access(source_buffer),
-      IREE_HAL_MEMORY_ACCESS_READ));
-  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage(
-      iree_hal_buffer_allowed_usage(source_buffer),
-      IREE_HAL_BUFFER_USAGE_TRANSFER_SOURCE));
-  IREE_RETURN_IF_ERROR(
-      iree_hal_buffer_validate_range(source_buffer, source_offset, length));
-
-  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage(
-      iree_hal_buffer_allowed_usage(target_buffer),
-      IREE_HAL_BUFFER_USAGE_TRANSFER_TARGET));
-  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_access(
-      iree_hal_buffer_allowed_access(target_buffer),
-      IREE_HAL_MEMORY_ACCESS_WRITE));
-  IREE_RETURN_IF_ERROR(
-      iree_hal_buffer_validate_range(target_buffer, target_offset, length));
-
-  // At least source or destination must be device-visible to enable
-  // host->device, device->host, and device->device.
-  // TODO(benvanik): host->host copies.
-  if (!iree_any_bit_set(iree_hal_buffer_memory_type(source_buffer),
-                        IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE) &&
-      !iree_any_bit_set(iree_hal_buffer_memory_type(target_buffer),
-                        IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) {
-#if IREE_STATUS_MODE
-    iree_bitfield_string_temp_t temp0, temp1;
-    iree_string_view_t source_memory_type_str = iree_hal_memory_type_format(
-        iree_hal_buffer_memory_type(source_buffer), &temp0);
-    iree_string_view_t target_memory_type_str = iree_hal_memory_type_format(
-        iree_hal_buffer_memory_type(target_buffer), &temp1);
-    return iree_make_status(
-        IREE_STATUS_PERMISSION_DENIED,
-        "at least one buffer must be device-visible for a copy; "
-        "source_buffer=%.*s, target_buffer=%.*s",
-        (int)source_memory_type_str.size, source_memory_type_str.data,
-        (int)target_memory_type_str.size, target_memory_type_str.data);
-#else
-    return iree_status_from_code(IREE_STATUS_PERMISSION_DENIED);
-#endif  // IREE_STATUS_MODE
+  if (source_ref.length != target_ref.length) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "copy span lengths of source and target must match "
+                            "(source_length=%" PRIdsz ", target_length=%" PRIdsz
+                            ")",
+                            source_ref.length, target_ref.length);
   }
 
+  const iree_hal_buffer_binding_requirements_t source_reqs = {
+      .required_compatibility = IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER,
+      .usage = IREE_HAL_BUFFER_USAGE_TRANSFER_SOURCE,
+      .access = IREE_HAL_MEMORY_ACCESS_READ,
+      .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE,
+      .max_byte_offset = source_ref.offset + source_ref.length,
+  };
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_requirements(
+      command_buffer, validation_state, source_ref, source_reqs));
+
+  const iree_hal_buffer_binding_requirements_t target_reqs = {
+      .required_compatibility = IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER,
+      .usage = IREE_HAL_BUFFER_USAGE_TRANSFER_TARGET,
+      .access = IREE_HAL_MEMORY_ACCESS_WRITE,
+      .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE,
+      .max_byte_offset = target_ref.offset + target_ref.length,
+  };
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_requirements(
+      command_buffer, validation_state, target_ref, target_reqs));
+
   // Check for overlap - just like memcpy we don't handle that.
-  if (iree_hal_buffer_test_overlap(source_buffer, source_offset, length,
-                                   target_buffer, target_offset, length) !=
+  // Note that an overlapping copy is undefined behavior rather than a memory
+  // safety issue, so we're ok if tricky situations (subspans of subspans of
+  // binding table subranges, etc.) make it through.
+  if (iree_hal_buffer_test_overlap(source_ref.buffer, source_ref.offset,
+                                   source_ref.length, target_ref.buffer,
+                                   target_ref.offset, target_ref.length) !=
       IREE_HAL_BUFFER_OVERLAP_DISJOINT) {
     return iree_make_status(
         IREE_STATUS_INVALID_ARGUMENT,
@@ -370,8 +447,8 @@
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
     iree_hal_channel_t* channel, iree_hal_collective_op_t op, uint32_t param,
-    iree_hal_buffer_binding_t send_binding,
-    iree_hal_buffer_binding_t recv_binding, iree_device_size_t element_count) {
+    iree_hal_buffer_ref_t send_ref, iree_hal_buffer_ref_t recv_ref,
+    iree_device_size_t element_count) {
   IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
       command_buffer, validation_state, IREE_HAL_COMMAND_CATEGORY_DISPATCH));
 
@@ -430,17 +507,15 @@
 
   // TODO(benvanik): add queue cap/usage for COLLECTIVE source/dest?
   if (info_bits & IREE_HAL_COLLECTIVE_REQUIRES_SEND_BINDING) {
-    if (!send_binding.buffer) {
-      return iree_make_status(
-          IREE_STATUS_INVALID_ARGUMENT,
-          "collective operation requires a send buffer binding");
-    } else {
-      IREE_RETURN_IF_ERROR(
-          iree_hal_command_buffer_validate_buffer_compatibility(
-              command_buffer, validation_state, send_binding.buffer,
-              IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH,
-              IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE_READ));
-    }
+    const iree_hal_buffer_binding_requirements_t send_reqs = {
+        .required_compatibility = IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH,
+        .usage = IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE_READ,
+        .access = IREE_HAL_MEMORY_ACCESS_READ,
+        .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE,
+        .max_byte_offset = send_ref.offset + send_ref.length,
+    };
+    IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_requirements(
+        command_buffer, validation_state, send_ref, send_reqs));
   } else {
     return iree_make_status(
         IREE_STATUS_INVALID_ARGUMENT,
@@ -448,17 +523,15 @@
   }
 
   if (info_bits & IREE_HAL_COLLECTIVE_REQUIRES_RECV_BINDING) {
-    if (!recv_binding.buffer) {
-      return iree_make_status(
-          IREE_STATUS_INVALID_ARGUMENT,
-          "collective operation requires a recv buffer binding");
-    } else {
-      IREE_RETURN_IF_ERROR(
-          iree_hal_command_buffer_validate_buffer_compatibility(
-              command_buffer, validation_state, recv_binding.buffer,
-              IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH,
-              IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE_WRITE));
-    }
+    const iree_hal_buffer_binding_requirements_t recv_reqs = {
+        .required_compatibility = IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH,
+        .usage = IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE_WRITE,
+        .access = IREE_HAL_MEMORY_ACCESS_WRITE,
+        .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE,
+        .max_byte_offset = recv_ref.offset + recv_ref.length,
+    };
+    IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_requirements(
+        command_buffer, validation_state, recv_ref, recv_reqs));
   } else {
     return iree_make_status(
         IREE_STATUS_INVALID_ARGUMENT,
@@ -491,13 +564,28 @@
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
     iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
-    iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings) {
+    iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings) {
   IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
       command_buffer, validation_state, IREE_HAL_COMMAND_CATEGORY_DISPATCH));
 
   // TODO(benvanik): validate set index.
 
+  // TODO(benvanik): use pipeline layout to derive usage and access bits.
+  iree_hal_buffer_binding_requirements_t requirements = {
+      .required_compatibility = IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH,
+      // .usage = IREE_HAL_BUFFER_USAGE_DISPATCH_...,
+      // .access = IREE_HAL_MEMORY_ACCESS_...,
+      .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE,
+  };
+  for (iree_host_size_t i = 0; i < binding_count; ++i) {
+    // TODO(benvanik): validate binding ordinal against pipeline layout.
+    requirements.max_byte_offset = bindings[i].offset + bindings[i].length;
+    IREE_RETURN_IF_ERROR(
+        iree_hal_command_buffer_validate_buffer_requirements(
+            command_buffer, validation_state, bindings[i], requirements),
+        "set[%u] binding[%u] (arg[%" PRIhsz "])", set, bindings[i].ordinal, i);
+  }
+
   return iree_ok_status();
 }
 
@@ -517,29 +605,59 @@
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
     iree_hal_executable_t* executable, int32_t entry_point,
-    iree_hal_buffer_t* workgroups_buffer,
-    iree_device_size_t workgroups_offset) {
+    iree_hal_buffer_ref_t workgroups_ref) {
   IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
       command_buffer, validation_state, IREE_HAL_COMMAND_CATEGORY_DISPATCH));
-  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_compatibility(
-      command_buffer, validation_state, workgroups_buffer,
-      IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH,
-      IREE_HAL_BUFFER_USAGE_DISPATCH_INDIRECT_PARAMS));
 
-  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type(
-      iree_hal_buffer_memory_type(workgroups_buffer),
-      IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE));
-  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_access(
-      iree_hal_buffer_allowed_access(workgroups_buffer),
-      IREE_HAL_MEMORY_ACCESS_READ));
-  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage(
-      iree_hal_buffer_allowed_usage(workgroups_buffer),
-      IREE_HAL_BUFFER_USAGE_DISPATCH_INDIRECT_PARAMS));
-  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_range(
-      workgroups_buffer, workgroups_offset, sizeof(uint32_t) * 3));
+  if ((workgroups_ref.offset % sizeof(uint32_t)) != 0) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "workgroup count offset does not match the required natural alignment "
+        "of uint32_t (offset=%" PRIdsz ", min_byte_alignment=%" PRIhsz ")",
+        workgroups_ref.offset, sizeof(uint32_t));
+  } else if (workgroups_ref.length < 3 * sizeof(uint32_t)) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "workgroup count buffer does not have the capacity "
+                            "to store the required 3 uint32_t values "
+                            "(length=%" PRIdsz ", min_length=%" PRIhsz ")",
+                            workgroups_ref.length, 3 * sizeof(uint32_t));
+  }
+
+  const iree_hal_buffer_binding_requirements_t workgroups_reqs = {
+      .required_compatibility = IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH,
+      .usage = IREE_HAL_BUFFER_USAGE_DISPATCH_INDIRECT_PARAMS,
+      .access = IREE_HAL_MEMORY_ACCESS_READ,
+      .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE,
+      .max_byte_offset = workgroups_ref.offset + workgroups_ref.length,
+      .min_byte_alignment = sizeof(uint32_t),
+  };
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_requirements(
+      command_buffer, validation_state, workgroups_ref, workgroups_reqs));
 
   IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_dispatch_bindings(
       command_buffer, validation_state, executable, entry_point));
 
   return iree_ok_status();
 }
+
+iree_status_t iree_hal_command_buffer_binding_table_validation(
+    iree_hal_command_buffer_t* command_buffer,
+    const iree_hal_command_buffer_validation_state_t* validation_state,
+    iree_hal_buffer_binding_table_t binding_table) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, command_buffer->binding_count);
+
+  // NOTE: we only validate from [0, binding_count) and don't care if there are
+  // extra bindings present.
+  for (uint32_t i = 0; i < command_buffer->binding_count; ++i) {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0,
+        iree_hal_command_buffer_validate_binding_requirements(
+            command_buffer, validation_state, binding_table.bindings[i],
+            validation_state->binding_requirements[i]),
+        "binding table slot %u", i);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
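
Recording is thus a requirements-gathering pass for indirect references, and this function performs the deferred per-slot checks once concrete buffers are known. A minimal usage sketch (buffer_a/buffer_b and the surrounding setup are hypothetical; validation_state is the storage embedded alongside the command buffer, and the table's count/bindings fields follow the uses elsewhere in this change):

iree_hal_buffer_binding_t bindings[2] = {
    {.buffer = buffer_a, .offset = 0, .length = 4096},
    {.buffer = buffer_b, .offset = 256, .length = 1024},
};
iree_hal_buffer_binding_table_t binding_table = {
    .count = IREE_ARRAYSIZE(bindings),
    .bindings = bindings,
};
// Fails fast if any slot in [0, binding_count) violates the merged
// usage/access/type bits, byte range, or minimum alignment for that slot.
IREE_RETURN_IF_ERROR(iree_hal_command_buffer_binding_table_validation(
    command_buffer, validation_state, binding_table));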
diff --git a/runtime/src/iree/hal/command_buffer_validation.h b/runtime/src/iree/hal/command_buffer_validation.h
index 33e502d..4a40b49 100644
--- a/runtime/src/iree/hal/command_buffer_validation.h
+++ b/runtime/src/iree/hal/command_buffer_validation.h
@@ -10,6 +10,36 @@
 #include "iree/base/api.h"
 #include "iree/hal/command_buffer.h"
 
+// Requirements for a buffer resource used within a command buffer.
+// Buffers that are bound must have all bits set from the included bitfields
+// and support the given min/max byte offsets as in-range.
+typedef struct iree_hal_buffer_binding_requirements_t {
+  iree_hal_buffer_compatibility_t required_compatibility;
+  iree_hal_buffer_usage_t usage;
+  iree_hal_memory_access_t access;
+  iree_hal_memory_type_t type;
+  // Maximum offset in the binding referenced by any command.
+  iree_device_size_t max_byte_offset;
+  // Minimum required alignment by at least one command.
+  iree_device_size_t min_byte_alignment;
+} iree_hal_buffer_binding_requirements_t;
+
+// Storage for command buffer validation state.
+// Designed to be embedded in concrete implementations that want validation.
+typedef struct iree_hal_command_buffer_validation_state_t {
+  iree_hal_device_t* device;
+  // 1 when in a begin/end recording sequence.
+  int32_t is_recording : 1;
+  // Debug group depth for tracking proper begin/end pairing.
+  int32_t debug_group_depth : 31;
+  // TODO(benvanik): current pipeline layout/descriptor set layout info.
+  // TODO(benvanik): valid push constant bit ranges.
+  // Requirements for each binding table entry.
+  // Unused slots in the binding table will have IREE_HAL_BUFFER_USAGE_NONE
+  // and are ignored even if a binding is provided for them when executed.
+  iree_hal_buffer_binding_requirements_t binding_requirements[0];
+} iree_hal_command_buffer_validation_state_t;
+
 void iree_hal_command_buffer_initialize_validation(
     iree_hal_device_t* device, iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* out_validation_state);
@@ -67,35 +97,31 @@
 iree_status_t iree_hal_command_buffer_discard_buffer_validation(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
-    iree_hal_buffer_t* buffer);
+    iree_hal_buffer_ref_t buffer_ref);
 
 iree_status_t iree_hal_command_buffer_fill_buffer_validation(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, const void* pattern,
+    iree_hal_buffer_ref_t target_ref, const void* pattern,
     iree_host_size_t pattern_length);
 
 iree_status_t iree_hal_command_buffer_update_buffer_validation(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
     const void* source_buffer, iree_host_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length);
+    iree_hal_buffer_ref_t target_ref);
 
 iree_status_t iree_hal_command_buffer_copy_buffer_validation(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
-    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length);
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref);
 
 iree_status_t iree_hal_command_buffer_collective_validation(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
     iree_hal_channel_t* channel, iree_hal_collective_op_t op, uint32_t param,
-    iree_hal_buffer_binding_t send_binding,
-    iree_hal_buffer_binding_t recv_binding, iree_device_size_t element_count);
+    iree_hal_buffer_ref_t send_ref, iree_hal_buffer_ref_t recv_ref,
+    iree_device_size_t element_count);
 
 iree_status_t iree_hal_command_buffer_push_constants_validation(
     iree_hal_command_buffer_t* command_buffer,
@@ -107,8 +133,7 @@
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
     iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
-    iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings);
+    iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings);
 
 iree_status_t iree_hal_command_buffer_dispatch_validation(
     iree_hal_command_buffer_t* command_buffer,
@@ -120,6 +145,11 @@
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_command_buffer_validation_state_t* validation_state,
     iree_hal_executable_t* executable, int32_t entry_point,
-    iree_hal_buffer_t* workgroups_buffer, iree_device_size_t workgroups_offset);
+    iree_hal_buffer_ref_t workgroups_ref);
+
+iree_status_t iree_hal_command_buffer_binding_table_validation(
+    iree_hal_command_buffer_t* command_buffer,
+    const iree_hal_command_buffer_validation_state_t* validation_state,
+    iree_hal_buffer_binding_table_t binding_table);
 
 #endif  // IREE_HAL_COMMAND_BUFFER_VALIDATION_H_
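
For backend authors: the trailing binding_requirements array makes the validation state variably sized, so implementations append it to their own command buffer allocation. A sketch following the graph command buffer change below, with hypothetical my_backend_* names:

iree_host_size_t total_size =
    sizeof(*command_buffer) +
    iree_hal_command_buffer_validation_state_size(mode, binding_capacity);
IREE_RETURN_IF_ERROR(iree_allocator_malloc(host_allocator, total_size,
                                           (void**)&command_buffer));
iree_hal_command_buffer_initialize(
    device, mode, command_categories, queue_affinity, binding_capacity,
    (uint8_t*)command_buffer + sizeof(*command_buffer),  // validation storage
    &my_backend_command_buffer_vtable, &command_buffer->base);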
diff --git a/runtime/src/iree/hal/cts/command_buffer_dispatch_test.h b/runtime/src/iree/hal/cts/command_buffer_dispatch_test.h
index 9ef7c3c..bea9b8e 100644
--- a/runtime/src/iree/hal/cts/command_buffer_dispatch_test.h
+++ b/runtime/src/iree/hal/cts/command_buffer_dispatch_test.h
@@ -112,7 +112,7 @@
   IREE_ASSERT_OK(iree_hal_allocator_allocate_buffer(
       device_allocator_, output_params, sizeof(float), &output_buffer));
 
-  iree_hal_descriptor_set_binding_t descriptor_set_bindings[] = {
+  iree_hal_buffer_ref_t descriptor_set_bindings[] = {
       {
           /*binding=*/0,
           /*buffer_slot=*/0,
diff --git a/runtime/src/iree/hal/cts/command_buffer_push_constants_test.h b/runtime/src/iree/hal/cts/command_buffer_push_constants_test.h
index 2a8d5a0..cfe4362 100644
--- a/runtime/src/iree/hal/cts/command_buffer_push_constants_test.h
+++ b/runtime/src/iree/hal/cts/command_buffer_push_constants_test.h
@@ -101,7 +101,7 @@
   IREE_ASSERT_OK(iree_hal_allocator_allocate_buffer(
       device_allocator_, output_params, 4 * sizeof(uint32_t), &output_buffer));
 
-  iree_hal_descriptor_set_binding_t descriptor_set_bindings[] = {
+  iree_hal_buffer_ref_t descriptor_set_bindings[] = {
       {
           /*binding=*/0,
           /*buffer_slot=*/0,
diff --git a/runtime/src/iree/hal/cts/command_buffer_test.h b/runtime/src/iree/hal/cts/command_buffer_test.h
index fcd1d29..2c5c984 100644
--- a/runtime/src/iree/hal/cts/command_buffer_test.h
+++ b/runtime/src/iree/hal/cts/command_buffer_test.h
@@ -62,8 +62,9 @@
 
     // Fill the pattern.
     IREE_CHECK_OK(iree_hal_command_buffer_fill_buffer(
-        command_buffer, device_buffer, target_offset, fill_length, pattern,
-        pattern_length));
+        command_buffer,
+        iree_hal_make_buffer_ref(device_buffer, target_offset, fill_length),
+        pattern, pattern_length));
     IREE_CHECK_OK(iree_hal_command_buffer_end(command_buffer));
     IREE_CHECK_OK(SubmitCommandBufferAndWait(command_buffer));
 
@@ -164,9 +165,10 @@
   // Copy the host buffer to the device buffer.
   IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
   IREE_ASSERT_OK(iree_hal_command_buffer_copy_buffer(
-      command_buffer, /*source_buffer=*/host_buffer, /*source_offset=*/0,
-      /*target_buffer=*/device_buffer, /*target_offset=*/0,
-      /*length=*/kDefaultAllocationSize));
+      command_buffer, /*source_ref=*/
+      iree_hal_make_buffer_ref(host_buffer, 0, kDefaultAllocationSize),
+      /*target_ref=*/
+      iree_hal_make_buffer_ref(device_buffer, 0, kDefaultAllocationSize)));
   IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
 
   IREE_ASSERT_OK(SubmitCommandBufferAndWait(command_buffer));
@@ -230,16 +232,25 @@
   uint8_t zero_val = 0x0;
   IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
   IREE_ASSERT_OK(iree_hal_command_buffer_fill_buffer(
-      command_buffer, device_buffer, /*target_offset=*/0, /*length=*/8,
+      command_buffer,
+      iree_hal_make_buffer_ref(device_buffer, /*target_offset=*/0,
+                               /*length=*/8),
       &zero_val, /*pattern_length=*/sizeof(zero_val)));
   IREE_ASSERT_OK(iree_hal_command_buffer_copy_buffer(
-      command_buffer, /*source_buffer=*/host_buffer, /*source_offset=*/4,
-      /*target_buffer=*/device_buffer, /*target_offset=*/8,
-      /*length=*/kDefaultAllocationSize / 2 - 4));
+      command_buffer,
+      iree_hal_make_buffer_ref(/*source_buffer=*/host_buffer,
+                               /*source_offset=*/4,
+                               /*length=*/kDefaultAllocationSize / 2 - 4),
+      iree_hal_make_buffer_ref(/*target_buffer=*/device_buffer,
+                               /*target_offset=*/8,
+                               /*length=*/kDefaultAllocationSize / 2 - 4)));
   IREE_ASSERT_OK(iree_hal_command_buffer_fill_buffer(
-      command_buffer, device_buffer,
-      /*target_offset=*/8 + kDefaultAllocationSize / 2 - 4,
-      /*length=*/kDefaultAllocationSize - (8 + kDefaultAllocationSize / 2 - 4),
+      command_buffer,
+      iree_hal_make_buffer_ref(
+          device_buffer,
+          /*target_offset=*/8 + kDefaultAllocationSize / 2 - 4,
+          /*length=*/kDefaultAllocationSize -
+              (8 + kDefaultAllocationSize / 2 - 4)),
       &zero_val,
       /*pattern_length=*/sizeof(zero_val)));
   IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
@@ -477,8 +488,9 @@
 
   // Issue the update_buffer command.
   IREE_CHECK_OK(iree_hal_command_buffer_update_buffer(
-      command_buffer, source_buffer.data(), /*source_offset=*/0, device_buffer,
-      /*target_offset=*/0, /*length=*/target_buffer_size));
+      command_buffer,
+      /*source_buffer=*/source_buffer.data(), /*source_offset=*/0,
+      iree_hal_make_buffer_ref(device_buffer, 0, target_buffer_size)));
   IREE_CHECK_OK(iree_hal_command_buffer_end(command_buffer));
   IREE_CHECK_OK(SubmitCommandBufferAndWait(command_buffer));
 
@@ -513,8 +525,10 @@
 
   // Issue the update_buffer command.
   IREE_CHECK_OK(iree_hal_command_buffer_update_buffer(
-      command_buffer, source_buffer.data(), /*source_offset=*/4, device_buffer,
-      /*target_offset=*/4, /*length=*/8));
+      command_buffer,
+      /*source_buffer=*/source_buffer.data(), /*source_offset=*/4,
+      iree_hal_make_buffer_ref(device_buffer,
+                               /*target_offset=*/4, /*length=*/8)));
   IREE_CHECK_OK(iree_hal_command_buffer_end(command_buffer));
   IREE_CHECK_OK(SubmitCommandBufferAndWait(command_buffer));
 
@@ -559,8 +573,10 @@
 
   // Issue the update_buffer command.
   IREE_CHECK_OK(iree_hal_command_buffer_update_buffer(
-      command_buffer, source_buffer.data(), /*source_offset=*/4, buffer_subspan,
-      /*target_offset=*/4, /*length=*/4));
+      command_buffer,
+      /*source_buffer=*/source_buffer.data(), /*source_offset=*/4,
+      iree_hal_make_buffer_ref(buffer_subspan,
+                               /*target_offset=*/4, /*length=*/4)));
   IREE_CHECK_OK(iree_hal_command_buffer_end(command_buffer));
   IREE_CHECK_OK(SubmitCommandBufferAndWait(command_buffer));
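
These tests exercise only the direct form, where iree_hal_make_buffer_ref wraps a concrete iree_hal_buffer_t at record time. The indirect form this change enables instead leaves buffer NULL and names a binding table slot; a sketch using the iree_hal_buffer_ref_t fields introduced here:

// Direct reference: the buffer is known when the command is recorded.
iree_hal_buffer_ref_t direct_ref =
    iree_hal_make_buffer_ref(device_buffer, /*offset=*/0, /*length=*/256);

// Indirect reference: resolved against the binding table at execution time.
iree_hal_buffer_ref_t indirect_ref = {
    .buffer_slot = 2,  // binding table slot to resolve
    .buffer = NULL,    // no concrete buffer yet
    .offset = 0,
    .length = 256,
};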
 
diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_device.c b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
index 879573a..c891d18 100644
--- a/runtime/src/iree/hal/drivers/cuda/cuda_device.c
+++ b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
@@ -541,20 +541,33 @@
   iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
 
   switch (device->params.command_buffer_mode) {
-    case IREE_HAL_CUDA_COMMAND_BUFFER_MODE_GRAPH:
-      return iree_hal_cuda_graph_command_buffer_create(
-          base_device, device->cuda_symbols, device->tracing_context,
-          device->cu_context, mode, command_categories, queue_affinity,
-          binding_capacity, &device->block_pool, device->host_allocator,
-          out_command_buffer);
-    case IREE_HAL_CUDA_COMMAND_BUFFER_MODE_STREAM:
+    case IREE_HAL_CUDA_COMMAND_BUFFER_MODE_GRAPH: {
+      // TODO(indirect-cmd): when we can record indirect graphs we won't need
+      // to use deferred command buffers - this is here to emulate indirect
+      // command buffers.
+      if (binding_capacity > 0) {
+        return iree_hal_deferred_command_buffer_create(
+            base_device, mode, command_categories, binding_capacity,
+            &device->block_pool, iree_hal_device_host_allocator(base_device),
+            out_command_buffer);
+      } else {
+        return iree_hal_cuda_graph_command_buffer_create(
+            base_device, device->cuda_symbols, device->tracing_context,
+            device->cu_context, mode, command_categories, queue_affinity,
+            binding_capacity, &device->block_pool, device->host_allocator,
+            out_command_buffer);
+      }
+    }
+    case IREE_HAL_CUDA_COMMAND_BUFFER_MODE_STREAM: {
       return iree_hal_deferred_command_buffer_create(
           base_device, mode, command_categories, binding_capacity,
           &device->block_pool, iree_hal_device_host_allocator(base_device),
           out_command_buffer);
-    default:
+    }
+    default: {
       return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                               "invalid command buffer mode");
+    }
   }
 }
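
In other words, until CUDA graphs can encode indirect bindings, a command buffer created with binding_capacity > 0 is captured as a deferred command buffer and replayed once the concrete table arrives with the submission, roughly:

// Hypothetical replay step (mirrors the pending_queue_actions change
// below); deferred_command_buffer was recorded against slot references.
IREE_RETURN_IF_ERROR(iree_hal_deferred_command_buffer_apply(
    deferred_command_buffer, stream_command_buffer, binding_table));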
 
@@ -767,7 +780,7 @@
       device->pending_queue_actions,
       iree_hal_cuda_device_collect_tracing_context, device->tracing_context,
       wait_semaphore_list, signal_semaphore_list, command_buffer_count,
-      command_buffers);
+      command_buffers, binding_tables);
   if (iree_status_is_ok(status)) {
     // Try to advance the pending workload queue.
     status = iree_hal_cuda_pending_queue_actions_issue(
diff --git a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
index c5748b6..de2de99 100644
--- a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
@@ -177,11 +177,16 @@
 
   iree_hal_cuda_graph_command_buffer_t* command_buffer = NULL;
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_allocator_malloc(host_allocator, sizeof(*command_buffer),
-                                (void**)&command_buffer));
+      z0,
+      iree_allocator_malloc(host_allocator,
+                            sizeof(*command_buffer) +
+                                iree_hal_command_buffer_validation_state_size(
+                                    mode, binding_capacity),
+                            (void**)&command_buffer));
 
   iree_hal_command_buffer_initialize(
       device, mode, command_categories, queue_affinity, binding_capacity,
+      (uint8_t*)command_buffer + sizeof(*command_buffer),
       &iree_hal_cuda_graph_command_buffer_vtable, &command_buffer->base);
   command_buffer->host_allocator = host_allocator;
   command_buffer->symbols = cuda_symbols;
@@ -448,7 +453,8 @@
 }
 
 static iree_status_t iree_hal_cuda_graph_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_ref_t buffer_ref) {
   // We could mark the memory as invalidated so that if this is a managed buffer
   // CUDA does not try to copy it back to the host.
   return iree_ok_status();
@@ -478,8 +484,7 @@
 
 static iree_status_t iree_hal_cuda_graph_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, const void* pattern,
+    iree_hal_buffer_ref_t target_ref, const void* pattern,
     iree_host_size_t pattern_length) {
   iree_hal_cuda_graph_command_buffer_t* command_buffer =
       iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
@@ -491,17 +496,19 @@
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1,
-                                       &target_buffer));
+                                       &target_ref.buffer));
 
   CUdeviceptr target_device_buffer = iree_hal_cuda_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(target_buffer));
-  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+      iree_hal_buffer_allocated_buffer(target_ref.buffer));
+  iree_device_size_t target_offset =
+      iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset;
   uint32_t pattern_4byte = iree_hal_cuda_splat_pattern(pattern, pattern_length);
+
   CUDA_MEMSET_NODE_PARAMS params = {
       .dst = target_device_buffer + target_offset,
       .elementSize = pattern_length,
-      .pitch = 0,                        // unused if height == 1
-      .width = length / pattern_length,  // element count
+      .pitch = 0,                                   // unused if height == 1
+      .width = target_ref.length / pattern_length,  // element count
       .height = 1,
       .value = pattern_4byte,
   };
@@ -528,8 +535,7 @@
 
 static iree_status_t iree_hal_cuda_graph_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
   iree_hal_cuda_graph_command_buffer_t* command_buffer =
       iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -546,23 +552,25 @@
   // operation and get the wrong data.
   uint8_t* storage = NULL;
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0,
-      iree_arena_allocate(&command_buffer->arena, length, (void**)&storage));
-  memcpy(storage, (const uint8_t*)source_buffer + source_offset, length);
+      z0, iree_arena_allocate(&command_buffer->arena, target_ref.length,
+                              (void**)&storage));
+  memcpy(storage, (const uint8_t*)source_buffer + source_offset,
+         target_ref.length);
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1,
-                                       &target_buffer));
+                                       &target_ref.buffer));
 
   CUdeviceptr target_device_buffer = iree_hal_cuda_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(target_buffer));
+      iree_hal_buffer_allocated_buffer(target_ref.buffer));
   CUDA_MEMCPY3D params = {
       .srcMemoryType = CU_MEMORYTYPE_HOST,
       .srcHost = storage,
       .dstMemoryType = CU_MEMORYTYPE_DEVICE,
       .dstDevice = target_device_buffer,
-      .dstXInBytes = iree_hal_buffer_byte_offset(target_buffer) + target_offset,
-      .WidthInBytes = length,
+      .dstXInBytes =
+          iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset,
+      .WidthInBytes = target_ref.length,
       .Height = 1,
       .Depth = 1,
   };
@@ -589,9 +597,7 @@
 
 static iree_status_t iree_hal_cuda_graph_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
   iree_hal_cuda_graph_command_buffer_t* command_buffer =
       iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -600,17 +606,20 @@
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer));
 
-  const iree_hal_buffer_t* buffers[2] = {source_buffer, target_buffer};
+  const iree_hal_buffer_t* resources[2] = {source_ref.buffer,
+                                           target_ref.buffer};
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0,
-      iree_hal_resource_set_insert(command_buffer->resource_set, 2, buffers));
+      z0, iree_hal_resource_set_insert(command_buffer->resource_set,
+                                       IREE_ARRAYSIZE(resources), resources));
 
-  CUdeviceptr target_device_buffer = iree_hal_cuda_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(target_buffer));
-  target_offset += iree_hal_buffer_byte_offset(target_buffer);
   CUdeviceptr source_device_buffer = iree_hal_cuda_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(source_buffer));
-  source_offset += iree_hal_buffer_byte_offset(source_buffer);
+      iree_hal_buffer_allocated_buffer(source_ref.buffer));
+  iree_device_size_t source_offset =
+      iree_hal_buffer_byte_offset(source_ref.buffer) + source_ref.offset;
+  CUdeviceptr target_device_buffer = iree_hal_cuda_buffer_device_pointer(
+      iree_hal_buffer_allocated_buffer(target_ref.buffer));
+  iree_device_size_t target_offset =
+      iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset;
 
   CUDA_MEMCPY3D params = {
       .srcMemoryType = CU_MEMORYTYPE_DEVICE,
@@ -619,7 +628,7 @@
       .dstMemoryType = CU_MEMORYTYPE_DEVICE,
       .dstDevice = target_device_buffer,
       .dstXInBytes = target_offset,
-      .WidthInBytes = length,
+      .WidthInBytes = target_ref.length,
       .Height = 1,
       .Depth = 1,
   };
@@ -646,11 +655,20 @@
 
 static iree_status_t iree_hal_cuda_graph_command_buffer_collective(
     iree_hal_command_buffer_t* base_command_buffer, iree_hal_channel_t* channel,
-    iree_hal_collective_op_t op, uint32_t param,
-    iree_hal_buffer_binding_t send_binding,
-    iree_hal_buffer_binding_t recv_binding, iree_device_size_t element_count) {
+    iree_hal_collective_op_t op, uint32_t param, iree_hal_buffer_ref_t send_ref,
+    iree_hal_buffer_ref_t recv_ref, iree_device_size_t element_count) {
   iree_hal_cuda_graph_command_buffer_t* command_buffer =
       iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
+  iree_hal_buffer_binding_t send_binding = {
+      .buffer = send_ref.buffer,
+      .offset = send_ref.offset,
+      .length = send_ref.length,
+  };
+  iree_hal_buffer_binding_t recv_binding = {
+      .buffer = recv_ref.buffer,
+      .offset = recv_ref.offset,
+      .length = recv_ref.length,
+  };
   return iree_hal_collective_batch_append(&command_buffer->collective_batch,
                                           channel, op, param, send_binding,
                                           recv_binding, element_count);
@@ -673,8 +691,7 @@
 static iree_status_t iree_hal_cuda_graph_command_buffer_push_descriptor_set(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
-    iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings) {
+    iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings) {
   if (binding_count > IREE_HAL_CUDA_MAX_DESCRIPTOR_SET_BINDING_COUNT) {
     return iree_make_status(
         IREE_STATUS_RESOURCE_EXHAUSTED,
@@ -689,7 +706,7 @@
 
   CUdeviceptr* current_bindings = command_buffer->descriptor_sets[set].bindings;
   for (iree_host_size_t i = 0; i < binding_count; i++) {
-    const iree_hal_descriptor_set_binding_t* binding = &bindings[i];
+    const iree_hal_buffer_ref_t* binding = &bindings[i];
     CUdeviceptr device_ptr = 0;
     if (binding->buffer) {
       IREE_RETURN_AND_END_ZONE_IF_ERROR(
@@ -701,7 +718,7 @@
       iree_device_size_t offset = iree_hal_buffer_byte_offset(binding->buffer);
       device_ptr = device_buffer + offset + binding->offset;
     }
-    current_bindings[binding->binding] = device_ptr;
+    current_bindings[binding->ordinal] = device_ptr;
   }
 
   IREE_TRACE_ZONE_END(z0);
@@ -837,8 +854,7 @@
 static iree_status_t iree_hal_cuda_graph_command_buffer_dispatch_indirect(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_executable_t* executable, int32_t entry_point,
-    iree_hal_buffer_t* workgroups_buffer,
-    iree_device_size_t workgroups_offset) {
+    iree_hal_buffer_ref_t workgroups_ref) {
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                           "indirect dispatch not yet implemented");
 }
diff --git a/runtime/src/iree/hal/drivers/cuda/pending_queue_actions.c b/runtime/src/iree/hal/drivers/cuda/pending_queue_actions.c
index 3b28011..33d2d02 100644
--- a/runtime/src/iree/hal/drivers/cuda/pending_queue_actions.c
+++ b/runtime/src/iree/hal/drivers/cuda/pending_queue_actions.c
@@ -74,8 +74,9 @@
   union {
     struct {
       iree_host_size_t count;
-      iree_hal_command_buffer_t** ptr;
-    } command_buffers;
+      iree_hal_command_buffer_t** command_buffers;
+      iree_hal_buffer_binding_table_t* binding_tables;
+    } execution;
   } payload;
 
   // The device from which to allocate CUDA stream-based command buffers for
@@ -431,58 +432,6 @@
         .destroy = iree_hal_cuda_pending_queue_actions_destroy,
 };
 
-// Copies of the given |in_list| to |out_list| to retain the command buffer
-// list.
-static iree_status_t iree_hal_cuda_copy_command_buffer_list(
-    iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* in_list, iree_allocator_t host_allocator,
-    iree_hal_command_buffer_t*** out_list) {
-  *out_list = NULL;
-  if (!command_buffer_count) return iree_ok_status();
-
-  iree_host_size_t total_size = command_buffer_count * sizeof(*in_list);
-  IREE_RETURN_IF_ERROR(
-      iree_allocator_malloc(host_allocator, total_size, (void**)out_list));
-  memcpy((void*)*out_list, in_list, total_size);
-  return iree_ok_status();
-}
-
-// Frees the semaphore and value list inside |semaphore_list|.
-static void iree_hal_cuda_free_command_buffer_list(
-    iree_allocator_t host_allocator,
-    iree_hal_command_buffer_t* const* command_buffer_list) {
-  iree_allocator_free(host_allocator, (void*)command_buffer_list);
-}
-
-// Copies of the given |in_list| to |out_list| to retain the semaphore and value
-// list.
-static iree_status_t iree_hal_cuda_copy_semaphore_list(
-    iree_hal_semaphore_list_t in_list, iree_allocator_t host_allocator,
-    iree_hal_semaphore_list_t* out_list) {
-  memset(out_list, 0, sizeof(*out_list));
-  if (!in_list.count) return iree_ok_status();
-
-  out_list->count = in_list.count;
-  iree_host_size_t semaphore_size = in_list.count * sizeof(*in_list.semaphores);
-  IREE_RETURN_IF_ERROR(iree_allocator_malloc(host_allocator, semaphore_size,
-                                             (void**)&out_list->semaphores));
-  memcpy(out_list->semaphores, in_list.semaphores, semaphore_size);
-
-  iree_host_size_t value_size = in_list.count * sizeof(*in_list.payload_values);
-  IREE_RETURN_IF_ERROR(iree_allocator_malloc(
-      host_allocator, value_size, (void**)&out_list->payload_values));
-  memcpy(out_list->payload_values, in_list.payload_values, value_size);
-  return iree_ok_status();
-}
-
-// Frees the semaphore and value list inside |semaphore_list|.
-static void iree_hal_cuda_free_semaphore_list(
-    iree_allocator_t host_allocator,
-    iree_hal_semaphore_list_t* semaphore_list) {
-  iree_allocator_free(host_allocator, semaphore_list->semaphores);
-  iree_allocator_free(host_allocator, semaphore_list->payload_values);
-}
-
 static void iree_hal_cuda_queue_action_destroy(
     iree_hal_cuda_queue_action_t* action) {
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -496,10 +445,6 @@
 
   // Only release resources after callbacks have been issued.
   iree_hal_resource_set_free(action->resource_set);
-  iree_hal_cuda_free_semaphore_list(host_allocator,
-                                    &action->wait_semaphore_list);
-  iree_hal_cuda_free_semaphore_list(host_allocator,
-                                    &action->signal_semaphore_list);
 
   iree_hal_cuda_queue_action_clear_events(action);
 
@@ -510,7 +455,7 @@
   IREE_TRACE_ZONE_END(z0);
 }
 
-static void decrement_work_items_count(
+static void iree_hal_cuda_queue_decrement_work_items_count(
     iree_hal_cuda_working_area_t* working_area) {
   iree_slim_mutex_lock(&working_area->pending_work_items_count_mutex);
   --working_area->pending_work_items_count;
@@ -531,15 +476,36 @@
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers) {
+    iree_hal_command_buffer_t* const* command_buffers,
+    iree_hal_buffer_binding_table_t const* binding_tables) {
   IREE_ASSERT_ARGUMENT(actions);
   IREE_ASSERT_ARGUMENT(command_buffer_count == 0 || command_buffers);
   IREE_TRACE_ZONE_BEGIN(z0);
 
+  // Embed captured tables in the action allocation.
   iree_hal_cuda_queue_action_t* action = NULL;
+  const iree_host_size_t wait_semaphore_list_size =
+      wait_semaphore_list.count * sizeof(*wait_semaphore_list.semaphores) +
+      wait_semaphore_list.count * sizeof(*wait_semaphore_list.payload_values);
+  const iree_host_size_t signal_semaphore_list_size =
+      signal_semaphore_list.count * sizeof(*signal_semaphore_list.semaphores) +
+      signal_semaphore_list.count *
+          sizeof(*signal_semaphore_list.payload_values);
+  const iree_host_size_t command_buffers_size =
+      command_buffer_count * sizeof(*action->payload.execution.command_buffers);
+  const iree_host_size_t binding_tables_size =
+      binding_tables ? command_buffer_count *
+                           sizeof(*action->payload.execution.binding_tables)
+                     : 0;
+  const iree_host_size_t payload_size =
+      command_buffers_size + binding_tables_size;
+  const iree_host_size_t total_action_size =
+      sizeof(*action) + wait_semaphore_list_size + signal_semaphore_list_size +
+      payload_size;
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_allocator_malloc(actions->host_allocator, sizeof(*action),
+      z0, iree_allocator_malloc(actions->host_allocator, total_action_size,
                                 (void**)&action));
+  uint8_t* action_ptr = (uint8_t*)action + sizeof(*action);
 
   action->owning_actions = actions;
   action->state = IREE_HAL_CUDA_QUEUE_ACTION_STATE_ALIVE;
@@ -554,51 +520,67 @@
   action->event_count = 0;
   action->is_pending = true;
 
+  // Copy wait list for later access.
+  action->wait_semaphore_list.count = wait_semaphore_list.count;
+  action->wait_semaphore_list.semaphores = (iree_hal_semaphore_t**)action_ptr;
+  memcpy(action->wait_semaphore_list.semaphores, wait_semaphore_list.semaphores,
+         wait_semaphore_list.count * sizeof(*wait_semaphore_list.semaphores));
+  action->wait_semaphore_list.payload_values =
+      (uint64_t*)(action_ptr + wait_semaphore_list.count *
+                                   sizeof(*wait_semaphore_list.semaphores));
+  memcpy(
+      action->wait_semaphore_list.payload_values,
+      wait_semaphore_list.payload_values,
+      wait_semaphore_list.count * sizeof(*wait_semaphore_list.payload_values));
+  action_ptr += wait_semaphore_list_size;
+
+  // Copy signal list for later access.
+  action->signal_semaphore_list.count = signal_semaphore_list.count;
+  action->signal_semaphore_list.semaphores = (iree_hal_semaphore_t**)action_ptr;
+  memcpy(
+      action->signal_semaphore_list.semaphores,
+      signal_semaphore_list.semaphores,
+      signal_semaphore_list.count * sizeof(*signal_semaphore_list.semaphores));
+  action->signal_semaphore_list.payload_values =
+      (uint64_t*)(action_ptr + signal_semaphore_list.count *
+                                   sizeof(*signal_semaphore_list.semaphores));
+  memcpy(action->signal_semaphore_list.payload_values,
+         signal_semaphore_list.payload_values,
+         signal_semaphore_list.count *
+             sizeof(*signal_semaphore_list.payload_values));
+  action_ptr += signal_semaphore_list_size;
+
+  // Copy the execution resources for later access.
+  action->payload.execution.count = command_buffer_count;
+  action->payload.execution.command_buffers =
+      (iree_hal_command_buffer_t**)action_ptr;
+  memcpy(action->payload.execution.command_buffers, command_buffers,
+         command_buffers_size);
+  action_ptr += command_buffers_size;
+  if (binding_tables) {
+    action->payload.execution.binding_tables =
+        (iree_hal_buffer_binding_table_t*)action_ptr;
+    memcpy(action->payload.execution.binding_tables, binding_tables,
+           binding_tables_size);
+    action_ptr += binding_tables_size;
+  } else {
+    // No tables were provided; execution will substitute empty tables.
+    action->payload.execution.binding_tables = NULL;
+  }
+
   // Retain all command buffers and semaphores.
-  iree_hal_resource_set_t* resource_set = NULL;
-  iree_status_t status =
-      iree_hal_resource_set_allocate(actions->block_pool, &resource_set);
-  if (IREE_LIKELY(iree_status_is_ok(status))) {
-    status = iree_hal_resource_set_insert(resource_set, command_buffer_count,
-                                          command_buffers);
+  iree_status_t status = iree_hal_resource_set_allocate(actions->block_pool,
+                                                        &action->resource_set);
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_resource_set_insert(action->resource_set,
+                                          wait_semaphore_list.count,
+                                          wait_semaphore_list.semaphores);
   }
-  if (IREE_LIKELY(iree_status_is_ok(status))) {
-    status =
-        iree_hal_resource_set_insert(resource_set, wait_semaphore_list.count,
-                                     wait_semaphore_list.semaphores);
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_resource_set_insert(action->resource_set,
+                                          signal_semaphore_list.count,
+                                          signal_semaphore_list.semaphores);
   }
-  if (IREE_LIKELY(iree_status_is_ok(status))) {
-    status =
-        iree_hal_resource_set_insert(resource_set, signal_semaphore_list.count,
-                                     signal_semaphore_list.semaphores);
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_resource_set_insert(
+        action->resource_set, command_buffer_count, command_buffers);
   }
-  if (IREE_LIKELY(iree_status_is_ok(status))) {
-    action->resource_set = resource_set;
-  }
-
-  // Copy the command buffer list for later access.
-  // TODO: avoid host allocator malloc; use some pool for the allocation.
-  if (IREE_LIKELY(iree_status_is_ok(status))) {
-    action->payload.command_buffers.count = command_buffer_count;
-    status = iree_hal_cuda_copy_command_buffer_list(
-        command_buffer_count, command_buffers, actions->host_allocator,
-        &action->payload.command_buffers.ptr);
-  }
-
-  // Copy the semaphore and value list for later access.
-  // TODO: avoid host allocator malloc; use some pool for the allocation.
-  if (IREE_LIKELY(iree_status_is_ok(status))) {
-    status = iree_hal_cuda_copy_semaphore_list(wait_semaphore_list,
-                                               actions->host_allocator,
-                                               &action->wait_semaphore_list);
-  }
-  if (IREE_LIKELY(iree_status_is_ok(status))) {
-    status = iree_hal_cuda_copy_semaphore_list(signal_semaphore_list,
-                                               actions->host_allocator,
-                                               &action->signal_semaphore_list);
-  }
-
-  if (IREE_LIKELY(iree_status_is_ok(status))) {
+  if (iree_status_is_ok(status)) {
     // Retain the owning queue to make sure the action outlives it.
     iree_hal_resource_retain(actions);
 
@@ -607,13 +589,7 @@
     iree_hal_cuda_queue_action_list_push_back(&actions->action_list, action);
     iree_slim_mutex_unlock(&actions->action_mutex);
   } else {
-    iree_hal_cuda_free_semaphore_list(actions->host_allocator,
-                                      &action->wait_semaphore_list);
-    iree_hal_cuda_free_semaphore_list(actions->host_allocator,
-                                      &action->signal_semaphore_list);
-    iree_hal_cuda_free_command_buffer_list(actions->host_allocator,
-                                           action->payload.command_buffers.ptr);
-    iree_hal_resource_set_free(resource_set);
+    iree_hal_resource_set_free(action->resource_set);
     iree_allocator_free(actions->host_allocator, action);
   }
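
The single allocation replaces the four separate heap copies removed above: everything the action references is carved out of one block and teardown is a single free. The trailing layout, for reference:

// [ iree_hal_cuda_queue_action_t                       ]
// [ wait semaphores        | wait payload values       ]
// [ signal semaphores      | signal payload values     ]
// [ command buffer ptrs    | binding tables (optional) ]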
 
@@ -684,7 +660,7 @@
   }
 
   // The callback (work item) is complete.
-  decrement_work_items_count(&actions->working_area);
+  iree_hal_cuda_queue_decrement_work_items_count(&actions->working_area);
 
   IREE_TRACE_ZONE_END(z0);
 }
@@ -712,16 +688,19 @@
   }
 
   // Then launch all command buffers to the dispatch stream.
-  IREE_TRACE_ZONE_BEGIN(dispatch_command_buffers);
-  IREE_TRACE_ZONE_APPEND_TEXT(dispatch_command_buffers,
-                              " dispatch_command_buffers",
-                              strlen(" dispatch_command_buffers"));
-  for (iree_host_size_t i = 0; i < action->payload.command_buffers.count; ++i) {
+  IREE_TRACE_ZONE_BEGIN(z_dispatch_command_buffers);
+  IREE_TRACE_ZONE_APPEND_TEXT(z_dispatch_command_buffers,
+                              "dispatch_command_buffers");
+  for (iree_host_size_t i = 0; i < action->payload.execution.count; ++i) {
     iree_hal_command_buffer_t* command_buffer =
-        action->payload.command_buffers.ptr[i];
+        action->payload.execution.command_buffers[i];
+    iree_hal_buffer_binding_table_t binding_table =
+        action->payload.execution.binding_tables
+            ? action->payload.execution.binding_tables[i]
+            : iree_hal_buffer_binding_table_empty();
     if (iree_hal_cuda_graph_command_buffer_isa(command_buffer)) {
-      CUgraphExec exec = iree_hal_cuda_graph_command_buffer_handle(
-          action->payload.command_buffers.ptr[i]);
+      CUgraphExec exec =
+          iree_hal_cuda_graph_command_buffer_handle(command_buffer);
       IREE_CUDA_RETURN_AND_END_ZONE_IF_ERROR(
           z0, symbols, cuGraphLaunch(exec, action->dispatch_cu_stream),
           "cuGraphLaunch");
@@ -736,19 +715,12 @@
                   action->device, mode, IREE_HAL_COMMAND_CATEGORY_ANY,
                   /*binding_capacity=*/0, &stream_command_buffer));
       IREE_RETURN_AND_END_ZONE_IF_ERROR(
-          z0, iree_hal_resource_set_insert(action->resource_set, 1,
-                                           &stream_command_buffer));
-      IREE_RETURN_AND_END_ZONE_IF_ERROR(
           z0, iree_hal_deferred_command_buffer_apply(
-                  command_buffer, stream_command_buffer,
-                  iree_hal_buffer_binding_table_empty()));
-      // The stream_command_buffer is going to be retained by
-      // the action->resource_set and deleted after the action
-      // completes.
+                  command_buffer, stream_command_buffer, binding_table));
       iree_hal_resource_release(stream_command_buffer);
     }
   }
-  IREE_TRACE_ZONE_END(dispatch_command_buffers);
+  IREE_TRACE_ZONE_END(z_dispatch_command_buffers);
 
   // Last record CUevent signals in the dispatch stream.
   for (iree_host_size_t i = 0; i < action->signal_semaphore_list.count; ++i) {
@@ -802,7 +774,7 @@
 
   // Now we fully executed and cleaned up this action. Decrease the work items
   // counter.
-  decrement_work_items_count(&actions->working_area);
+  iree_hal_cuda_queue_decrement_work_items_count(&actions->working_area);
 
   IREE_TRACE_ZONE_END(z0);
 }
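
Reviewer note: the dispatch loop above now resolves one binding table per
command buffer and forwards it to iree_hal_deferred_command_buffer_apply,
falling back to an empty table when the submission carried none. A minimal
sketch of that fallback, using only names present in this diff (the helper
itself is hypothetical):

    // Hypothetical helper mirroring the fallback in the dispatch loop:
    // the i'th caller-provided table, or an empty table when the caller
    // passed no tables at all.
    static iree_hal_buffer_binding_table_t select_binding_table(
        const iree_hal_buffer_binding_table_t* binding_tables,
        iree_host_size_t i) {
      return binding_tables ? binding_tables[i]
                            : iree_hal_buffer_binding_table_empty();
    }
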
diff --git a/runtime/src/iree/hal/drivers/cuda/pending_queue_actions.h b/runtime/src/iree/hal/drivers/cuda/pending_queue_actions.h
index 02a2996..b889428 100644
--- a/runtime/src/iree/hal/drivers/cuda/pending_queue_actions.h
+++ b/runtime/src/iree/hal/drivers/cuda/pending_queue_actions.h
@@ -65,7 +65,8 @@
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers);
+    iree_hal_command_buffer_t* const* command_buffers,
+    iree_hal_buffer_binding_table_t const* binding_tables);
 
 // Tries to scan the pending actions and release ready ones to the GPU.
 iree_status_t iree_hal_cuda_pending_queue_actions_issue(
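
Reviewer note: in both the CUDA and HIP headers the two new arrays are
parallel: entry i of |binding_tables| (when the pointer is non-NULL) supplies
the indirect bindings for |command_buffers|[i], and a NULL pointer means no
entry has a table. A hedged sketch of a call site's trailing arguments, where
cb_direct, cb_indirect, and indirect_table are illustrative names only:

    // Two queued command buffers; only the second resolves bindings
    // through a table (all three names here are hypothetical).
    iree_hal_command_buffer_t* command_buffers[2] = {cb_direct, cb_indirect};
    iree_hal_buffer_binding_table_t binding_tables[2] = {
        iree_hal_buffer_binding_table_empty(),  // cb_direct binds concretely
        indirect_table,  // slot -> buffer mappings for cb_indirect
    };
    // These become the trailing command_buffer_count/command_buffers/
    // binding_tables arguments of the enqueue call declared above.
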
diff --git a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c
index 784d906..6533cc3 100644
--- a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c
@@ -83,13 +83,17 @@
 
   iree_hal_cuda_stream_command_buffer_t* command_buffer = NULL;
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_allocator_malloc(host_allocator, sizeof(*command_buffer),
-                                (void**)&command_buffer));
+      z0,
+      iree_allocator_malloc(host_allocator,
+                            sizeof(*command_buffer) +
+                                iree_hal_command_buffer_validation_state_size(
+                                    mode, binding_capacity),
+                            (void**)&command_buffer));
 
   iree_hal_command_buffer_initialize(
       device, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY,
-      binding_capacity, &iree_hal_cuda_stream_command_buffer_vtable,
-      &command_buffer->base);
+      binding_capacity, (uint8_t*)command_buffer + sizeof(*command_buffer),
+      &iree_hal_cuda_stream_command_buffer_vtable, &command_buffer->base);
   command_buffer->host_allocator = host_allocator;
   command_buffer->cuda_symbols = cuda_symbols;
   command_buffer->nccl_symbols = nccl_symbols;
@@ -291,7 +295,8 @@
 }
 
 static iree_status_t iree_hal_cuda_stream_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_ref_t buffer_ref) {
   // We could mark the memory as invalidated so that if this is a managed
   // buffer CUDA does not try to copy it back to the host.
   return iree_ok_status();
@@ -299,8 +304,7 @@
 
 static iree_status_t iree_hal_cuda_stream_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, const void* pattern,
+    iree_hal_buffer_ref_t target_ref, const void* pattern,
     iree_host_size_t pattern_length) {
   iree_hal_cuda_stream_command_buffer_t* command_buffer =
       iree_hal_cuda_stream_command_buffer_cast(base_command_buffer);
@@ -311,11 +315,11 @@
       iree_hal_cuda_stream_command_buffer_flush_collectives(command_buffer));
 
   CUdeviceptr target_device_buffer = iree_hal_cuda_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(target_buffer));
-  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+      iree_hal_buffer_allocated_buffer(target_ref.buffer));
+  iree_device_size_t target_offset =
+      iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset;
   CUdeviceptr dst = target_device_buffer + target_offset;
-  size_t num_elements = length / pattern_length;
-
+  size_t num_elements = target_ref.length / pattern_length;
   switch (pattern_length) {
     case 4: {
       IREE_CUDA_RETURN_AND_END_ZONE_IF_ERROR(
@@ -353,8 +357,7 @@
 
 static iree_status_t iree_hal_cuda_stream_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
   iree_hal_cuda_stream_command_buffer_t* command_buffer =
       iree_hal_cuda_stream_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -373,20 +376,21 @@
   if (command_buffer->arena.block_pool) {
     uint8_t* storage = NULL;
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
-        z0,
-        iree_arena_allocate(&command_buffer->arena, length, (void**)&storage));
-    memcpy(storage, src, length);
+        z0, iree_arena_allocate(&command_buffer->arena, target_ref.length,
+                                (void**)&storage));
+    memcpy(storage, src, target_ref.length);
     src = storage;
   }
 
   // Issue the copy using the scratch memory as the source.
   CUdeviceptr target_device_buffer = iree_hal_cuda_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(target_buffer));
+      iree_hal_buffer_allocated_buffer(target_ref.buffer));
   CUdeviceptr dst = target_device_buffer +
-                    iree_hal_buffer_byte_offset(target_buffer) + target_offset;
+                    iree_hal_buffer_byte_offset(target_ref.buffer) +
+                    target_ref.offset;
   IREE_CUDA_RETURN_AND_END_ZONE_IF_ERROR(
       z0, command_buffer->cuda_symbols,
-      cuMemcpyHtoDAsync(dst, src, length, command_buffer->cu_stream),
+      cuMemcpyHtoDAsync(dst, src, target_ref.length, command_buffer->cu_stream),
       "cuMemcpyHtoDAsync");
 
   IREE_TRACE_ZONE_END(z0);
@@ -395,9 +399,7 @@
 
 static iree_status_t iree_hal_cuda_stream_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
   iree_hal_cuda_stream_command_buffer_t* command_buffer =
       iree_hal_cuda_stream_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -406,18 +408,20 @@
       z0,
       iree_hal_cuda_stream_command_buffer_flush_collectives(command_buffer));
 
-  CUdeviceptr target_device_buffer = iree_hal_cuda_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(target_buffer));
-  target_offset += iree_hal_buffer_byte_offset(target_buffer);
   CUdeviceptr source_device_buffer = iree_hal_cuda_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(source_buffer));
-  source_offset += iree_hal_buffer_byte_offset(source_buffer);
-  CUdeviceptr dst = target_device_buffer + target_offset;
+      iree_hal_buffer_allocated_buffer(source_ref.buffer));
+  iree_device_size_t source_offset =
+      iree_hal_buffer_byte_offset(source_ref.buffer) + source_ref.offset;
+  CUdeviceptr target_device_buffer = iree_hal_cuda_buffer_device_pointer(
+      iree_hal_buffer_allocated_buffer(target_ref.buffer));
+  iree_device_size_t target_offset =
+      iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset;
   CUdeviceptr src = source_device_buffer + source_offset;
+  CUdeviceptr dst = target_device_buffer + target_offset;
 
   IREE_CUDA_RETURN_AND_END_ZONE_IF_ERROR(
       z0, command_buffer->cuda_symbols,
-      cuMemcpyAsync(dst, src, length, command_buffer->cu_stream),
+      cuMemcpyAsync(dst, src, target_ref.length, command_buffer->cu_stream),
       "cuMemcpyAsync");
 
   IREE_TRACE_ZONE_END(z0);
@@ -426,13 +430,22 @@
 
 static iree_status_t iree_hal_cuda_stream_command_buffer_collective(
     iree_hal_command_buffer_t* base_command_buffer, iree_hal_channel_t* channel,
-    iree_hal_collective_op_t op, uint32_t param,
-    iree_hal_buffer_binding_t send_binding,
-    iree_hal_buffer_binding_t recv_binding, iree_device_size_t element_count) {
+    iree_hal_collective_op_t op, uint32_t param, iree_hal_buffer_ref_t send_ref,
+    iree_hal_buffer_ref_t recv_ref, iree_device_size_t element_count) {
   iree_hal_cuda_stream_command_buffer_t* command_buffer =
       iree_hal_cuda_stream_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
 
+  iree_hal_buffer_binding_t send_binding = {
+      .buffer = send_ref.buffer,
+      .offset = send_ref.offset,
+      .length = send_ref.length,
+  };
+  iree_hal_buffer_binding_t recv_binding = {
+      .buffer = recv_ref.buffer,
+      .offset = recv_ref.offset,
+      .length = recv_ref.length,
+  };
   iree_status_t status = iree_hal_collective_batch_append(
       &command_buffer->collective_batch, channel, op, param, send_binding,
       recv_binding, element_count);
@@ -462,8 +475,7 @@
 static iree_status_t iree_hal_cuda_stream_command_buffer_push_descriptor_set(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
-    iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings) {
+    iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings) {
   if (binding_count > IREE_HAL_CUDA_MAX_DESCRIPTOR_SET_BINDING_COUNT) {
     return iree_make_status(
         IREE_STATUS_RESOURCE_EXHAUSTED,
@@ -478,19 +490,18 @@
 
   CUdeviceptr* current_bindings = command_buffer->descriptor_sets[set].bindings;
   for (iree_host_size_t i = 0; i < binding_count; i++) {
-    const iree_hal_descriptor_set_binding_t* binding = &bindings[i];
+    const iree_hal_buffer_ref_t* binding = &bindings[i];
     CUdeviceptr device_ptr = 0;
     if (binding->buffer) {
       IREE_RETURN_AND_END_ZONE_IF_ERROR(
           z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1,
                                            &binding->buffer));
-
       CUdeviceptr device_buffer = iree_hal_cuda_buffer_device_pointer(
           iree_hal_buffer_allocated_buffer(binding->buffer));
       iree_device_size_t offset = iree_hal_buffer_byte_offset(binding->buffer);
       device_ptr = device_buffer + offset + binding->offset;
     }
-    current_bindings[binding->binding] = device_ptr;
+    current_bindings[binding->ordinal] = device_ptr;
   }
 
   IREE_TRACE_ZONE_END(z0);
@@ -612,8 +623,7 @@
 static iree_status_t iree_hal_cuda_stream_command_buffer_dispatch_indirect(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_executable_t* executable, int32_t entry_point,
-    iree_hal_buffer_t* workgroups_buffer,
-    iree_device_size_t workgroups_offset) {
+    iree_hal_buffer_ref_t workgroups_ref) {
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                           "need cuda implementation of dispatch indirect");
 }
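
Reviewer note: every transfer command in this file now derives its device
address from an iree_hal_buffer_ref_t instead of a separate
buffer/offset/length triple, and the resolution is always the same two-level
add. A hedged sketch of that computation as a standalone (hypothetical)
helper, assuming only the ref fields this diff touches:

    // Resolve a buffer ref to an absolute CUdeviceptr: the allocation base,
    // plus the subspan offset baked into the iree_hal_buffer_t, plus the
    // command-relative offset carried by the ref.
    static CUdeviceptr resolve_ref(iree_hal_buffer_ref_t ref) {
      CUdeviceptr base = iree_hal_cuda_buffer_device_pointer(
          iree_hal_buffer_allocated_buffer(ref.buffer));
      return base + iree_hal_buffer_byte_offset(ref.buffer) + ref.offset;
    }
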
diff --git a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c
index 64ab55c..7ba006c 100644
--- a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c
@@ -180,11 +180,16 @@
 
   iree_hal_hip_graph_command_buffer_t* command_buffer = NULL;
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_allocator_malloc(host_allocator, sizeof(*command_buffer),
-                                (void**)&command_buffer));
+      z0,
+      iree_allocator_malloc(host_allocator,
+                            sizeof(*command_buffer) +
+                                iree_hal_command_buffer_validation_state_size(
+                                    mode, binding_capacity),
+                            (void**)&command_buffer));
 
   iree_hal_command_buffer_initialize(
       device, mode, command_categories, queue_affinity, binding_capacity,
+      (uint8_t*)command_buffer + sizeof(*command_buffer),
       &iree_hal_hip_graph_command_buffer_vtable, &command_buffer->base);
   command_buffer->host_allocator = host_allocator;
   command_buffer->symbols = hip_symbols;
@@ -452,7 +457,8 @@
 }
 
 static iree_status_t iree_hal_hip_graph_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_ref_t buffer_ref) {
   // We could mark the memory as invalidated so that if this is a managed buffer
   // HIP does not try to copy it back to the host.
   return iree_ok_status();
@@ -482,8 +488,7 @@
 
 static iree_status_t iree_hal_hip_graph_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, const void* pattern,
+    iree_hal_buffer_ref_t target_ref, const void* pattern,
     iree_host_size_t pattern_length) {
   iree_hal_hip_graph_command_buffer_t* command_buffer =
       iree_hal_hip_graph_command_buffer_cast(base_command_buffer);
@@ -495,17 +500,18 @@
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1,
-                                       &target_buffer));
+                                       &target_ref.buffer));
 
   hipDeviceptr_t target_device_buffer = iree_hal_hip_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(target_buffer));
-  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+      iree_hal_buffer_allocated_buffer(target_ref.buffer));
+  iree_device_size_t target_offset =
+      iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset;
   uint32_t pattern_4byte = iree_hal_hip_splat_pattern(pattern, pattern_length);
   hipMemsetParams params = {
       .dst = (uint8_t*)target_device_buffer + target_offset,
       .elementSize = pattern_length,
-      .pitch = 0,                        // unused if height == 1
-      .width = length / pattern_length,  // element count
+      .pitch = 0,                                   // unused if height == 1
+      .width = target_ref.length / pattern_length,  // element count
       .height = 1,
       .value = pattern_4byte,
   };
@@ -532,8 +538,7 @@
 
 static iree_status_t iree_hal_hip_graph_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
   iree_hal_hip_graph_command_buffer_t* command_buffer =
       iree_hal_hip_graph_command_buffer_cast(base_command_buffer);
   if (command_buffer->symbols->hipDrvGraphAddMemcpyNode == NULL) {
@@ -555,24 +560,26 @@
   // operation and get the wrong data.
   uint8_t* storage = NULL;
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0,
-      iree_arena_allocate(&command_buffer->arena, length, (void**)&storage));
-  memcpy(storage, (const uint8_t*)source_buffer + source_offset, length);
+      z0, iree_arena_allocate(&command_buffer->arena, target_ref.length,
+                              (void**)&storage));
+  memcpy(storage, (const uint8_t*)source_buffer + source_offset,
+         target_ref.length);
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1,
-                                       &target_buffer));
+                                       &target_ref.buffer));
 
   hipDeviceptr_t target_device_buffer = iree_hal_hip_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(target_buffer));
+      iree_hal_buffer_allocated_buffer(target_ref.buffer));
 
   HIP_MEMCPY3D params = {
       .srcMemoryType = hipMemoryTypeHost,
       .srcHost = storage,
       .dstMemoryType = hipMemoryTypeDevice,
       .dstDevice = target_device_buffer,
-      .dstXInBytes = iree_hal_buffer_byte_offset(target_buffer) + target_offset,
-      .WidthInBytes = length,
+      .dstXInBytes =
+          iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset,
+      .WidthInBytes = target_ref.length,
       .Height = 1,
       .Depth = 1,
   };
@@ -599,9 +606,7 @@
 
 static iree_status_t iree_hal_hip_graph_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
   iree_hal_hip_graph_command_buffer_t* command_buffer =
       iree_hal_hip_graph_command_buffer_cast(base_command_buffer);
   if (command_buffer->symbols->hipDrvGraphAddMemcpyNode == NULL) {
@@ -615,17 +620,19 @@
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer));
 
-  const iree_hal_buffer_t* buffers[2] = {source_buffer, target_buffer};
+  const iree_hal_buffer_t* buffers[2] = {source_ref.buffer, target_ref.buffer};
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0,
       iree_hal_resource_set_insert(command_buffer->resource_set, 2, buffers));
 
   hipDeviceptr_t target_device_buffer = iree_hal_hip_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(target_buffer));
-  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+      iree_hal_buffer_allocated_buffer(target_ref.buffer));
+  iree_device_size_t target_offset =
+      iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset;
   hipDeviceptr_t source_device_buffer = iree_hal_hip_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(source_buffer));
-  source_offset += iree_hal_buffer_byte_offset(source_buffer);
+      iree_hal_buffer_allocated_buffer(source_ref.buffer));
+  iree_device_size_t source_offset =
+      iree_hal_buffer_byte_offset(source_ref.buffer) + source_ref.offset;
 
   HIP_MEMCPY3D params = {
       .srcMemoryType = hipMemoryTypeDevice,
@@ -634,7 +641,7 @@
       .dstMemoryType = hipMemoryTypeDevice,
       .dstDevice = target_device_buffer,
       .dstXInBytes = target_offset,
-      .WidthInBytes = length,
+      .WidthInBytes = target_ref.length,
       .Height = 1,
       .Depth = 1,
   };
@@ -661,11 +668,20 @@
 
 static iree_status_t iree_hal_hip_graph_command_buffer_collective(
     iree_hal_command_buffer_t* base_command_buffer, iree_hal_channel_t* channel,
-    iree_hal_collective_op_t op, uint32_t param,
-    iree_hal_buffer_binding_t send_binding,
-    iree_hal_buffer_binding_t recv_binding, iree_device_size_t element_count) {
+    iree_hal_collective_op_t op, uint32_t param, iree_hal_buffer_ref_t send_ref,
+    iree_hal_buffer_ref_t recv_ref, iree_device_size_t element_count) {
   iree_hal_hip_graph_command_buffer_t* command_buffer =
       iree_hal_hip_graph_command_buffer_cast(base_command_buffer);
+  iree_hal_buffer_binding_t send_binding = {
+      .buffer = send_ref.buffer,
+      .offset = send_ref.offset,
+      .length = send_ref.length,
+  };
+  iree_hal_buffer_binding_t recv_binding = {
+      .buffer = recv_ref.buffer,
+      .offset = recv_ref.offset,
+      .length = recv_ref.length,
+  };
   return iree_hal_collective_batch_append(&command_buffer->collective_batch,
                                           channel, op, param, send_binding,
                                           recv_binding, element_count);
@@ -694,8 +710,7 @@
 static iree_status_t iree_hal_hip_graph_command_buffer_push_descriptor_set(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
-    iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings) {
+    iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings) {
   if (binding_count > IREE_HAL_HIP_MAX_DESCRIPTOR_SET_BINDING_COUNT) {
     return iree_make_status(
         IREE_STATUS_RESOURCE_EXHAUSTED,
@@ -710,7 +725,7 @@
   hipDeviceptr_t* current_bindings =
       command_buffer->descriptor_sets[set].bindings;
   for (iree_host_size_t i = 0; i < binding_count; i++) {
-    const iree_hal_descriptor_set_binding_t* binding = &bindings[i];
+    const iree_hal_buffer_ref_t* binding = &bindings[i];
     hipDeviceptr_t device_ptr = NULL;
     if (binding->buffer) {
       IREE_RETURN_AND_END_ZONE_IF_ERROR(
@@ -723,7 +738,7 @@
       device_ptr = (uint8_t*)device_buffer + offset + binding->offset;
     }
 
-    current_bindings[binding->binding] = device_ptr;
+    current_bindings[binding->ordinal] = device_ptr;
   }
 
   IREE_TRACE_ZONE_END(z0);
@@ -844,8 +859,7 @@
 static iree_status_t iree_hal_hip_graph_command_buffer_dispatch_indirect(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_executable_t* executable, int32_t entry_point,
-    iree_hal_buffer_t* workgroups_buffer,
-    iree_device_size_t workgroups_offset) {
+    iree_hal_buffer_ref_t workgroups_ref) {
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                           "indirect dispatch not yet implemented");
 }
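
Reviewer note: push_descriptor_set now indexes the binding array with
binding->ordinal where iree_hal_descriptor_set_binding_t used a field named
binding. For reading these hunks, a hedged summary of the ref fields the diff
actually exercises; the real definition lives in the HAL headers and likely
carries more (e.g. a table slot index for indirect references), so treat
names, types, and order as illustrative:

    // iree_hal_buffer_ref_t fields used in this diff (assumed shape):
    //   ordinal  -- descriptor/binding-table slot the ref occupies
    //   buffer   -- concrete iree_hal_buffer_t*, or NULL when the ref is
    //               resolved through a binding table at submission time
    //   offset   -- byte offset within the resolved buffer
    //   length   -- byte length of the referenced range
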
diff --git a/runtime/src/iree/hal/drivers/hip/hip_device.c b/runtime/src/iree/hal/drivers/hip/hip_device.c
index 13e41a7..dfd1b20 100644
--- a/runtime/src/iree/hal/drivers/hip/hip_device.c
+++ b/runtime/src/iree/hal/drivers/hip/hip_device.c
@@ -774,7 +774,7 @@
       device->pending_queue_actions,
       iree_hal_hip_device_collect_tracing_context, device->tracing_context,
       wait_semaphore_list, signal_semaphore_list, command_buffer_count,
-      command_buffers);
+      command_buffers, binding_tables);
   if (iree_status_is_ok(status)) {
     // Try to advance the pending workload queue.
     status =
diff --git a/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c b/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c
index a18c4b8..c225c17 100644
--- a/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c
+++ b/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c
@@ -75,8 +75,9 @@
   union {
     struct {
       iree_host_size_t count;
-      iree_hal_command_buffer_t** ptr;
-    } command_buffers;
+      iree_hal_command_buffer_t** command_buffers;
+      iree_hal_buffer_binding_table_t* binding_tables;
+    } execution;
   } payload;
 
   // The device from which to allocate HIP stream-based command buffers for
@@ -432,58 +433,6 @@
         .destroy = iree_hal_hip_pending_queue_actions_destroy,
 };
 
-// Copies of the given |in_list| to |out_list| to retain the command buffer
-// list.
-static iree_status_t iree_hal_hip_copy_command_buffer_list(
-    iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* in_list, iree_allocator_t host_allocator,
-    iree_hal_command_buffer_t*** out_list) {
-  *out_list = NULL;
-  if (!command_buffer_count) return iree_ok_status();
-
-  iree_host_size_t total_size = command_buffer_count * sizeof(*in_list);
-  IREE_RETURN_IF_ERROR(
-      iree_allocator_malloc(host_allocator, total_size, (void**)out_list));
-  memcpy((void*)*out_list, in_list, total_size);
-  return iree_ok_status();
-}
-
-// Frees the semaphore and value list inside |semaphore_list|.
-static void iree_hal_hip_free_command_buffer_list(
-    iree_allocator_t host_allocator,
-    iree_hal_command_buffer_t* const* command_buffer_list) {
-  iree_allocator_free(host_allocator, (void*)command_buffer_list);
-}
-
-// Copies of the given |in_list| to |out_list| to retain the semaphore and value
-// list.
-static iree_status_t iree_hal_hip_copy_semaphore_list(
-    iree_hal_semaphore_list_t in_list, iree_allocator_t host_allocator,
-    iree_hal_semaphore_list_t* out_list) {
-  memset(out_list, 0, sizeof(*out_list));
-  if (!in_list.count) return iree_ok_status();
-
-  out_list->count = in_list.count;
-  iree_host_size_t semaphore_size = in_list.count * sizeof(*in_list.semaphores);
-  IREE_RETURN_IF_ERROR(iree_allocator_malloc(host_allocator, semaphore_size,
-                                             (void**)&out_list->semaphores));
-  memcpy(out_list->semaphores, in_list.semaphores, semaphore_size);
-
-  iree_host_size_t value_size = in_list.count * sizeof(*in_list.payload_values);
-  IREE_RETURN_IF_ERROR(iree_allocator_malloc(
-      host_allocator, value_size, (void**)&out_list->payload_values));
-  memcpy(out_list->payload_values, in_list.payload_values, value_size);
-  return iree_ok_status();
-}
-
-// Frees the semaphore and value list inside |semaphore_list|.
-static void iree_hal_hip_free_semaphore_list(
-    iree_allocator_t host_allocator,
-    iree_hal_semaphore_list_t* semaphore_list) {
-  iree_allocator_free(host_allocator, semaphore_list->semaphores);
-  iree_allocator_free(host_allocator, semaphore_list->payload_values);
-}
-
 static void iree_hal_hip_queue_action_destroy(
     iree_hal_hip_queue_action_t* action) {
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -497,10 +446,6 @@
 
   // Only release resources after callbacks have been issued.
   iree_hal_resource_set_free(action->resource_set);
-  iree_hal_hip_free_semaphore_list(host_allocator,
-                                   &action->wait_semaphore_list);
-  iree_hal_hip_free_semaphore_list(host_allocator,
-                                   &action->signal_semaphore_list);
 
   iree_hal_hip_queue_action_clear_events(action);
 
@@ -511,7 +456,7 @@
   IREE_TRACE_ZONE_END(z0);
 }
 
-static void decrement_work_items_count(
+static void iree_hal_hip_queue_decrement_work_items_count(
     iree_hal_hip_working_area_t* working_area) {
   iree_slim_mutex_lock(&working_area->pending_work_items_count_mutex);
   --working_area->pending_work_items_count;
@@ -532,15 +477,36 @@
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers) {
+    iree_hal_command_buffer_t* const* command_buffers,
+    iree_hal_buffer_binding_table_t const* binding_tables) {
   IREE_ASSERT_ARGUMENT(actions);
   IREE_ASSERT_ARGUMENT(command_buffer_count == 0 || command_buffers);
   IREE_TRACE_ZONE_BEGIN(z0);
 
+  // Embed the copied semaphore lists, command buffer list, and binding
+  // tables in the same allocation as the action itself.
   iree_hal_hip_queue_action_t* action = NULL;
+  const iree_host_size_t wait_semaphore_list_size =
+      wait_semaphore_list.count * sizeof(*wait_semaphore_list.semaphores) +
+      wait_semaphore_list.count * sizeof(*wait_semaphore_list.payload_values);
+  const iree_host_size_t signal_semaphore_list_size =
+      signal_semaphore_list.count * sizeof(*signal_semaphore_list.semaphores) +
+      signal_semaphore_list.count *
+          sizeof(*signal_semaphore_list.payload_values);
+  const iree_host_size_t command_buffers_size =
+      command_buffer_count * sizeof(*action->payload.execution.command_buffers);
+  const iree_host_size_t binding_tables_size =
+      binding_tables ? command_buffer_count *
+                           sizeof(*action->payload.execution.binding_tables)
+                     : 0;
+  const iree_host_size_t payload_size =
+      command_buffers_size + binding_tables_size;
+  const iree_host_size_t total_action_size =
+      sizeof(*action) + wait_semaphore_list_size + signal_semaphore_list_size +
+      payload_size;
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_allocator_malloc(actions->host_allocator, sizeof(*action),
+      z0, iree_allocator_malloc(actions->host_allocator, total_action_size,
                                 (void**)&action));
+  uint8_t* action_ptr = (uint8_t*)action + sizeof(*action);
 
   action->owning_actions = actions;
   action->state = IREE_HAL_HIP_QUEUE_ACTION_STATE_ALIVE;
@@ -555,51 +521,67 @@
   action->event_count = 0;
   action->is_pending = true;
 
+  // Copy wait list for later access.
+  action->wait_semaphore_list.count = wait_semaphore_list.count;
+  action->wait_semaphore_list.semaphores = (iree_hal_semaphore_t**)action_ptr;
+  memcpy(action->wait_semaphore_list.semaphores, wait_semaphore_list.semaphores,
+         wait_semaphore_list.count * sizeof(*wait_semaphore_list.semaphores));
+  action->wait_semaphore_list.payload_values =
+      (uint64_t*)(action_ptr + wait_semaphore_list.count *
+                                   sizeof(*wait_semaphore_list.semaphores));
+  memcpy(
+      action->wait_semaphore_list.payload_values,
+      wait_semaphore_list.payload_values,
+      wait_semaphore_list.count * sizeof(*wait_semaphore_list.payload_values));
+  action_ptr += wait_semaphore_list_size;
+
+  // Copy signal list for later access.
+  action->signal_semaphore_list.count = signal_semaphore_list.count;
+  action->signal_semaphore_list.semaphores = (iree_hal_semaphore_t**)action_ptr;
+  memcpy(
+      action->signal_semaphore_list.semaphores,
+      signal_semaphore_list.semaphores,
+      signal_semaphore_list.count * sizeof(*signal_semaphore_list.semaphores));
+  action->signal_semaphore_list.payload_values =
+      (uint64_t*)(action_ptr + signal_semaphore_list.count *
+                                   sizeof(*signal_semaphore_list.semaphores));
+  memcpy(action->signal_semaphore_list.payload_values,
+         signal_semaphore_list.payload_values,
+         signal_semaphore_list.count *
+             sizeof(*signal_semaphore_list.payload_values));
+  action_ptr += signal_semaphore_list_size;
+
+  // Copy the execution resources for later access.
+  action->payload.execution.count = command_buffer_count;
+  action->payload.execution.command_buffers =
+      (iree_hal_command_buffer_t**)action_ptr;
+  memcpy(action->payload.execution.command_buffers, command_buffers,
+         command_buffers_size);
+  action_ptr += command_buffers_size;
+  if (binding_tables) {
+    action->payload.execution.binding_tables =
+        (iree_hal_buffer_binding_table_t*)action_ptr;
+    memcpy(action->payload.execution.binding_tables, binding_tables,
+           binding_tables_size);
+    action_ptr += binding_tables_size;
+  } else {
+    // Leave NULL so the dispatch loop falls back to empty binding tables
+    // instead of reading past the end of the allocation.
+    action->payload.execution.binding_tables = NULL;
+  }
+
   // Retain all command buffers and semaphores.
-  iree_hal_resource_set_t* resource_set = NULL;
-  iree_status_t status =
-      iree_hal_resource_set_allocate(actions->block_pool, &resource_set);
-  if (IREE_LIKELY(iree_status_is_ok(status))) {
-    status = iree_hal_resource_set_insert(resource_set, command_buffer_count,
-                                          command_buffers);
+  iree_status_t status = iree_hal_resource_set_allocate(actions->block_pool,
+                                                        &action->resource_set);
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_resource_set_insert(action->resource_set,
+                                          wait_semaphore_list.count,
+                                          wait_semaphore_list.semaphores);
   }
-  if (IREE_LIKELY(iree_status_is_ok(status))) {
-    status =
-        iree_hal_resource_set_insert(resource_set, wait_semaphore_list.count,
-                                     wait_semaphore_list.semaphores);
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_resource_set_insert(action->resource_set,
+                                          signal_semaphore_list.count,
+                                          signal_semaphore_list.semaphores);
   }
-  if (IREE_LIKELY(iree_status_is_ok(status))) {
-    status =
-        iree_hal_resource_set_insert(resource_set, signal_semaphore_list.count,
-                                     signal_semaphore_list.semaphores);
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_resource_set_insert(
+        action->resource_set, command_buffer_count, command_buffers);
   }
-  if (IREE_LIKELY(iree_status_is_ok(status))) {
-    action->resource_set = resource_set;
-  }
-
-  // Copy the command buffer list for later access.
-  // TODO: avoid host allocator malloc; use some pool for the allocation.
-  if (IREE_LIKELY(iree_status_is_ok(status))) {
-    action->payload.command_buffers.count = command_buffer_count;
-    status = iree_hal_hip_copy_command_buffer_list(
-        command_buffer_count, command_buffers, actions->host_allocator,
-        &action->payload.command_buffers.ptr);
-  }
-
-  // Copy the semaphore and value list for later access.
-  // TODO: avoid host allocator malloc; use some pool for the allocation.
-  if (IREE_LIKELY(iree_status_is_ok(status))) {
-    status = iree_hal_hip_copy_semaphore_list(wait_semaphore_list,
-                                              actions->host_allocator,
-                                              &action->wait_semaphore_list);
-  }
-  if (IREE_LIKELY(iree_status_is_ok(status))) {
-    status = iree_hal_hip_copy_semaphore_list(signal_semaphore_list,
-                                              actions->host_allocator,
-                                              &action->signal_semaphore_list);
-  }
-
-  if (IREE_LIKELY(iree_status_is_ok(status))) {
+  if (iree_status_is_ok(status)) {
     // Retain the owning queue to make sure the action outlives it.
     iree_hal_resource_retain(actions);
 
@@ -608,13 +590,7 @@
     iree_hal_hip_queue_action_list_push_back(&actions->action_list, action);
     iree_slim_mutex_unlock(&actions->action_mutex);
   } else {
-    iree_hal_hip_free_semaphore_list(actions->host_allocator,
-                                     &action->wait_semaphore_list);
-    iree_hal_hip_free_semaphore_list(actions->host_allocator,
-                                     &action->signal_semaphore_list);
-    iree_hal_hip_free_command_buffer_list(actions->host_allocator,
-                                          action->payload.command_buffers.ptr);
-    iree_hal_resource_set_free(resource_set);
+    iree_hal_resource_set_free(action->resource_set);
     iree_allocator_free(actions->host_allocator, action);
   }
 
@@ -684,7 +660,7 @@
   }
 
   // The callback (work item) is complete.
-  decrement_work_items_count(&actions->working_area);
+  iree_hal_hip_queue_decrement_work_items_count(&actions->working_area);
 
   IREE_TRACE_ZONE_END(z0);
 }
@@ -712,21 +688,24 @@
   }
 
   // Then launch all command buffers to the dispatch stream.
-  IREE_TRACE_ZONE_BEGIN(dispatch_command_buffers);
-  IREE_TRACE_ZONE_APPEND_TEXT(dispatch_command_buffers,
-                              " dispatch_command_buffers",
-                              strlen(" dispatch_command_buffers"));
-  for (iree_host_size_t i = 0; i < action->payload.command_buffers.count; ++i) {
+  IREE_TRACE_ZONE_BEGIN(z_dispatch_command_buffers);
+  IREE_TRACE_ZONE_APPEND_TEXT(z_dispatch_command_buffers,
+                              "dispatch_command_buffers");
+  for (iree_host_size_t i = 0; i < action->payload.execution.count; ++i) {
     iree_hal_command_buffer_t* command_buffer =
-        action->payload.command_buffers.ptr[i];
+        action->payload.execution.command_buffers[i];
+    iree_hal_buffer_binding_table_t binding_table =
+        action->payload.execution.binding_tables
+            ? action->payload.execution.binding_tables[i]
+            : iree_hal_buffer_binding_table_empty();
     if (iree_hal_hip_stream_command_buffer_isa(command_buffer)) {
       // Nothing to do for an inline command buffer; all the work has already
       // been submitted. When we support semaphores we'll still need to signal
       // their completion but do not have to worry about any waits: if there
       // were waits we wouldn't have been able to execute inline!
     } else if (iree_hal_hip_graph_command_buffer_isa(command_buffer)) {
-      hipGraphExec_t exec = iree_hal_hip_graph_command_buffer_handle(
-          action->payload.command_buffers.ptr[i]);
+      hipGraphExec_t exec =
+          iree_hal_hip_graph_command_buffer_handle(command_buffer);
       IREE_HIP_RETURN_AND_END_ZONE_IF_ERROR(
           z0, symbols, hipGraphLaunch(exec, action->dispatch_hip_stream),
           "hipGraphLaunch");
@@ -741,19 +720,12 @@
                   action->device, mode, IREE_HAL_COMMAND_CATEGORY_ANY,
                   /*binding_capacity=*/0, &stream_command_buffer));
       IREE_RETURN_AND_END_ZONE_IF_ERROR(
-          z0, iree_hal_resource_set_insert(action->resource_set, 1,
-                                           &stream_command_buffer));
-      IREE_RETURN_AND_END_ZONE_IF_ERROR(
           z0, iree_hal_deferred_command_buffer_apply(
-                  command_buffer, stream_command_buffer,
-                  iree_hal_buffer_binding_table_empty()));
-      // The stream_command_buffer is going to be retained by
-      // the action->resource_set and deleted after the action
-      // completes.
+                  command_buffer, stream_command_buffer, binding_table));
       iree_hal_resource_release(stream_command_buffer);
     }
   }
-  IREE_TRACE_ZONE_END(dispatch_command_buffers);
+  IREE_TRACE_ZONE_END(z_dispatch_command_buffers);
 
   // Last record hipEvent_t signals in the dispatch stream.
   for (iree_host_size_t i = 0; i < action->signal_semaphore_list.count; ++i) {
@@ -806,7 +778,7 @@
 
   // Now we fully executed and cleaned up this action. Decrease the work items
   // counter.
-  decrement_work_items_count(&actions->working_area);
+  iree_hal_hip_queue_decrement_work_items_count(&actions->working_area);
 
   IREE_TRACE_ZONE_END(z0);
 }
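
Reviewer note: the enqueue path above replaces four separate host-allocator
copies (wait list, signal list, command buffer list, and their matching
frees) with a single allocation trailing the action struct, which also
retires the two "avoid host allocator malloc" TODOs. A sketch of the layout,
with sizes as computed in the code above:

    // Single-allocation layout of a queue action; action_ptr walks the
    // trailing storage in the same order the sizes were summed:
    //
    //   [ iree_hal_hip_queue_action_t            ]  sizeof(*action)
    //   [ wait semaphores | wait payload values  ]  wait_semaphore_list_size
    //   [ signal semaphores | signal payloads    ]  signal_semaphore_list_size
    //   [ command buffer pointers                ]  command_buffers_size
    //   [ binding tables (only if provided)      ]  binding_tables_size
    //
    // One iree_allocator_free() of the action releases everything, which is
    // why the error path needs only iree_hal_resource_set_free() plus that
    // single free.
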
diff --git a/runtime/src/iree/hal/drivers/hip/pending_queue_actions.h b/runtime/src/iree/hal/drivers/hip/pending_queue_actions.h
index 1b85336..25df62b 100644
--- a/runtime/src/iree/hal/drivers/hip/pending_queue_actions.h
+++ b/runtime/src/iree/hal/drivers/hip/pending_queue_actions.h
@@ -65,7 +65,8 @@
     const iree_hal_semaphore_list_t wait_semaphore_list,
     const iree_hal_semaphore_list_t signal_semaphore_list,
     iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers);
+    iree_hal_command_buffer_t* const* command_buffers,
+    iree_hal_buffer_binding_table_t const* binding_tables);
 
 // Tries to scan the pending actions and release ready ones to the GPU.
 iree_status_t iree_hal_hip_pending_queue_actions_issue(
diff --git a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c
index 9dc7786..cedca50 100644
--- a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c
@@ -83,13 +83,17 @@
 
   iree_hal_hip_stream_command_buffer_t* command_buffer = NULL;
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_allocator_malloc(host_allocator, sizeof(*command_buffer),
-                                (void**)&command_buffer));
+      z0,
+      iree_allocator_malloc(host_allocator,
+                            sizeof(*command_buffer) +
+                                iree_hal_command_buffer_validation_state_size(
+                                    mode, binding_capacity),
+                            (void**)&command_buffer));
 
   iree_hal_command_buffer_initialize(
       device, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY,
-      binding_capacity, &iree_hal_hip_stream_command_buffer_vtable,
-      &command_buffer->base);
+      binding_capacity, (uint8_t*)command_buffer + sizeof(*command_buffer),
+      &iree_hal_hip_stream_command_buffer_vtable, &command_buffer->base);
   command_buffer->host_allocator = host_allocator;
   command_buffer->hip_symbols = hip_symbols;
   command_buffer->nccl_symbols = nccl_symbols;
@@ -281,7 +285,8 @@
 }
 
 static iree_status_t iree_hal_hip_stream_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_ref_t buffer_ref) {
   // We could mark the memory as invalidated so that if this is a managed
   // buffer HIP does not try to copy it back to the host.
   return iree_ok_status();
@@ -289,8 +294,7 @@
 
 static iree_status_t iree_hal_hip_stream_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, const void* pattern,
+    iree_hal_buffer_ref_t target_ref, const void* pattern,
     iree_host_size_t pattern_length) {
   iree_hal_hip_stream_command_buffer_t* command_buffer =
       iree_hal_hip_stream_command_buffer_cast(base_command_buffer);
@@ -300,10 +304,11 @@
       z0, iree_hal_hip_stream_command_buffer_flush_collectives(command_buffer));
 
   hipDeviceptr_t target_device_buffer = iree_hal_hip_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(target_buffer));
-  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+      iree_hal_buffer_allocated_buffer(target_ref.buffer));
+  iree_device_size_t target_offset =
+      iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset;
   hipDeviceptr_t dst = (uint8_t*)target_device_buffer + target_offset;
-  size_t num_elements = length / pattern_length;
+  size_t num_elements = target_ref.length / pattern_length;
 
   switch (pattern_length) {
     case 4: {
@@ -342,8 +347,7 @@
 
 static iree_status_t iree_hal_hip_stream_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
   iree_hal_hip_stream_command_buffer_t* command_buffer =
       iree_hal_hip_stream_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -361,21 +365,22 @@
   if (command_buffer->arena.block_pool) {
     uint8_t* storage = NULL;
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
-        z0,
-        iree_arena_allocate(&command_buffer->arena, length, (void**)&storage));
-    memcpy(storage, src, length);
+        z0, iree_arena_allocate(&command_buffer->arena, target_ref.length,
+                                (void**)&storage));
+    memcpy(storage, src, target_ref.length);
     src = storage;
   }
 
   // Issue the copy using the scratch memory as the source.
   hipDeviceptr_t target_device_buffer = iree_hal_hip_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(target_buffer));
+      iree_hal_buffer_allocated_buffer(target_ref.buffer));
   hipDeviceptr_t dst = (uint8_t*)target_device_buffer +
-                       iree_hal_buffer_byte_offset(target_buffer) +
-                       target_offset;
+                       iree_hal_buffer_byte_offset(target_ref.buffer) +
+                       target_ref.offset;
   IREE_HIP_RETURN_AND_END_ZONE_IF_ERROR(
       z0, command_buffer->hip_symbols,
-      hipMemcpyHtoDAsync(dst, (void*)src, length, command_buffer->hip_stream),
+      hipMemcpyHtoDAsync(dst, (void*)src, target_ref.length,
+                         command_buffer->hip_stream),
       "hipMemcpyHtoDAsync");
 
   IREE_TRACE_ZONE_END(z0);
@@ -384,9 +389,7 @@
 
 static iree_status_t iree_hal_hip_stream_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
   iree_hal_hip_stream_command_buffer_t* command_buffer =
       iree_hal_hip_stream_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -395,17 +398,19 @@
       z0, iree_hal_hip_stream_command_buffer_flush_collectives(command_buffer));
 
   hipDeviceptr_t target_device_buffer = iree_hal_hip_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(target_buffer));
-  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+      iree_hal_buffer_allocated_buffer(target_ref.buffer));
+  iree_device_size_t target_offset =
+      iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset;
   hipDeviceptr_t source_device_buffer = iree_hal_hip_buffer_device_pointer(
-      iree_hal_buffer_allocated_buffer(source_buffer));
-  source_offset += iree_hal_buffer_byte_offset(source_buffer);
+      iree_hal_buffer_allocated_buffer(source_ref.buffer));
+  iree_device_size_t source_offset =
+      iree_hal_buffer_byte_offset(source_ref.buffer) + source_ref.offset;
   hipDeviceptr_t dst = (uint8_t*)target_device_buffer + target_offset;
   hipDeviceptr_t src = (uint8_t*)source_device_buffer + source_offset;
 
   IREE_HIP_RETURN_AND_END_ZONE_IF_ERROR(
       z0, command_buffer->hip_symbols,
-      hipMemcpyAsync(dst, src, length, hipMemcpyDeviceToDevice,
+      hipMemcpyAsync(dst, src, target_ref.length, hipMemcpyDeviceToDevice,
                      command_buffer->hip_stream),
       "hipMemcpyAsync");
 
@@ -415,13 +420,22 @@
 
 static iree_status_t iree_hal_hip_stream_command_buffer_collective(
     iree_hal_command_buffer_t* base_command_buffer, iree_hal_channel_t* channel,
-    iree_hal_collective_op_t op, uint32_t param,
-    iree_hal_buffer_binding_t send_binding,
-    iree_hal_buffer_binding_t recv_binding, iree_device_size_t element_count) {
+    iree_hal_collective_op_t op, uint32_t param, iree_hal_buffer_ref_t send_ref,
+    iree_hal_buffer_ref_t recv_ref, iree_device_size_t element_count) {
   iree_hal_hip_stream_command_buffer_t* command_buffer =
       iree_hal_hip_stream_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
 
+  iree_hal_buffer_binding_t send_binding = {
+      .buffer = send_ref.buffer,
+      .offset = send_ref.offset,
+      .length = send_ref.length,
+  };
+  iree_hal_buffer_binding_t recv_binding = {
+      .buffer = recv_ref.buffer,
+      .offset = recv_ref.offset,
+      .length = recv_ref.length,
+  };
   iree_status_t status = iree_hal_collective_batch_append(
       &command_buffer->collective_batch, channel, op, param, send_binding,
       recv_binding, element_count);
@@ -451,8 +465,7 @@
 static iree_status_t iree_hal_hip_stream_command_buffer_push_descriptor_set(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
-    iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings) {
+    iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings) {
   if (binding_count > IREE_HAL_HIP_MAX_DESCRIPTOR_SET_BINDING_COUNT) {
     return iree_make_status(
         IREE_STATUS_RESOURCE_EXHAUSTED,
@@ -468,7 +481,7 @@
   hipDeviceptr_t* current_bindings =
       command_buffer->descriptor_sets[set].bindings;
   for (iree_host_size_t i = 0; i < binding_count; i++) {
-    const iree_hal_descriptor_set_binding_t* binding = &bindings[i];
+    const iree_hal_buffer_ref_t* binding = &bindings[i];
     hipDeviceptr_t device_ptr = NULL;
     if (binding->buffer) {
       IREE_RETURN_AND_END_ZONE_IF_ERROR(
@@ -480,7 +493,7 @@
       iree_device_size_t offset = iree_hal_buffer_byte_offset(binding->buffer);
       device_ptr = (uint8_t*)device_buffer + offset + binding->offset;
     }
-    current_bindings[binding->binding] = device_ptr;
+    current_bindings[binding->ordinal] = device_ptr;
   }
 
   IREE_TRACE_ZONE_END(z0);
@@ -590,8 +603,7 @@
 static iree_status_t iree_hal_hip_stream_command_buffer_dispatch_indirect(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_executable_t* executable, int32_t entry_point,
-    iree_hal_buffer_t* workgroups_buffer,
-    iree_device_size_t workgroups_offset) {
+    iree_hal_buffer_ref_t workgroups_ref) {
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                           "need hip implementation of dispatch indirect");
 }
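
Reviewer note: the collective path is the one place refs are converted back
into iree_hal_buffer_binding_t, because iree_hal_collective_batch_append
still takes bindings; only buffer/offset/length survive and any slot ordinal
is dropped. A hypothetical helper capturing the conversion done inline in
both the stream and graph command buffers:

    // Narrow a buffer ref to the binding struct the collective batch API
    // still expects; indirect refs must be resolved to a concrete buffer
    // before reaching this point.
    static iree_hal_buffer_binding_t binding_from_ref(
        iree_hal_buffer_ref_t ref) {
      iree_hal_buffer_binding_t binding = {
          .buffer = ref.buffer,
          .offset = ref.offset,
          .length = ref.length,
      };
      return binding;
    }
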
diff --git a/runtime/src/iree/hal/drivers/local_sync/sync_device.c b/runtime/src/iree/hal/drivers/local_sync/sync_device.c
index f2cc6df..7e6323a 100644
--- a/runtime/src/iree/hal/drivers/local_sync/sync_device.c
+++ b/runtime/src/iree/hal/drivers/local_sync/sync_device.c
@@ -339,49 +339,6 @@
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_sync_device_apply_deferred_command_buffers(
-    iree_hal_sync_device_t* device, iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers) {
-  // See if there are any deferred command buffers; this saves us work in cases
-  // of pure inline execution.
-  bool any_deferred = false;
-  for (iree_host_size_t i = 0; i < command_buffer_count && !any_deferred; ++i) {
-    any_deferred = iree_hal_deferred_command_buffer_isa(command_buffers[i]);
-  }
-  if (!any_deferred) return iree_ok_status();
-
-  // Stack allocate storage for an inline command buffer we'll use to replay
-  // the deferred command buffers. We want to reset it between each apply so
-  // that we don't get state carrying across.
-  iree_byte_span_t storage =
-      iree_make_byte_span(iree_alloca(iree_hal_inline_command_buffer_size()),
-                          iree_hal_inline_command_buffer_size());
-
-  // NOTE: we ignore any inline command buffers that may be passed in as they've
-  // already executed during recording. The caller is probably in for a bad time
-  // if they mixed the two modes together!
-  for (iree_host_size_t i = 0; i < command_buffer_count; ++i) {
-    iree_hal_command_buffer_t* command_buffer = command_buffers[i];
-    if (iree_hal_deferred_command_buffer_isa(command_buffer)) {
-      iree_hal_command_buffer_t* inline_command_buffer = NULL;
-      IREE_RETURN_IF_ERROR(iree_hal_inline_command_buffer_initialize(
-          (iree_hal_device_t*)device,
-          iree_hal_command_buffer_mode(command_buffer) |
-              IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION,
-          IREE_HAL_COMMAND_CATEGORY_ANY, IREE_HAL_QUEUE_AFFINITY_ANY,
-          /*binding_capacity=*/0, device->host_allocator, storage,
-          &inline_command_buffer));
-      iree_status_t status = iree_hal_deferred_command_buffer_apply(
-          command_buffer, inline_command_buffer,
-          iree_hal_buffer_binding_table_empty());
-      iree_hal_inline_command_buffer_deinitialize(inline_command_buffer);
-      IREE_RETURN_IF_ERROR(status);
-    }
-  }
-
-  return iree_ok_status();
-}
-
 static iree_status_t iree_hal_sync_device_queue_read(
     iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
@@ -424,6 +381,57 @@
   return loop_status;
 }
 
+static iree_status_t iree_hal_sync_device_apply_deferred_command_buffers(
+    iree_hal_sync_device_t* device, iree_host_size_t command_buffer_count,
+    iree_hal_command_buffer_t* const* command_buffers,
+    iree_hal_buffer_binding_table_t const* binding_tables) {
+  // See if there are any deferred command buffers; this saves us work in cases
+  // of pure inline execution.
+  bool any_deferred = false;
+  for (iree_host_size_t i = 0; i < command_buffer_count && !any_deferred; ++i) {
+    any_deferred = iree_hal_deferred_command_buffer_isa(command_buffers[i]);
+  }
+  if (!any_deferred) return iree_ok_status();
+
+  // Stack allocate storage for an inline command buffer we'll use to replay
+  // the deferred command buffers. We want to reset it between each apply so
+  // that we don't get state carrying across.
+  iree_host_size_t storage_size = iree_hal_inline_command_buffer_size(
+      IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED,
+      /*binding_capacity=*/0);
+  iree_byte_span_t storage =
+      iree_make_byte_span(iree_alloca(storage_size), storage_size);
+
+  // NOTE: we ignore any inline command buffers that may be passed in as they've
+  // already executed during recording. The caller is probably in for a bad time
+  // if they mixed the two modes together!
+  for (iree_host_size_t i = 0; i < command_buffer_count; ++i) {
+    iree_hal_command_buffer_t* command_buffer = command_buffers[i];
+    iree_hal_buffer_binding_table_t binding_table =
+        binding_tables ? binding_tables[i]
+                       : iree_hal_buffer_binding_table_empty();
+    if (iree_hal_deferred_command_buffer_isa(command_buffer)) {
+      // NOTE: we run unvalidated as inline command buffers don't support
+      // binding tables and can be validated entirely while recording.
+      iree_hal_command_buffer_t* inline_command_buffer = NULL;
+      IREE_RETURN_IF_ERROR(iree_hal_inline_command_buffer_initialize(
+          (iree_hal_device_t*)device,
+          iree_hal_command_buffer_mode(command_buffer) |
+              IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION |
+              IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED,
+          IREE_HAL_COMMAND_CATEGORY_ANY, IREE_HAL_QUEUE_AFFINITY_ANY,
+          /*binding_capacity=*/0, device->host_allocator, storage,
+          &inline_command_buffer));
+      iree_status_t status = iree_hal_deferred_command_buffer_apply(
+          command_buffer, inline_command_buffer, binding_table);
+      iree_hal_inline_command_buffer_deinitialize(inline_command_buffer);
+      IREE_RETURN_IF_ERROR(status);
+    }
+  }
+
+  return iree_ok_status();
+}
+
 static iree_status_t iree_hal_sync_device_queue_execute(
     iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
@@ -446,7 +454,7 @@
   // Run all deferred command buffers - any we could have run inline we already
   // did during recording.
   IREE_RETURN_IF_ERROR(iree_hal_sync_device_apply_deferred_command_buffers(
-      device, command_buffer_count, command_buffers));
+      device, command_buffer_count, command_buffers, binding_tables));
 
   // Signal all semaphores now that batch work has completed.
   IREE_RETURN_IF_ERROR(iree_hal_sync_semaphore_multi_signal(
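
Reviewer note: the inline replay target is created with binding_capacity 0
yet still honors per-submission tables. The inference from this diff (not
verified against the deferred command buffer source) is that
iree_hal_deferred_command_buffer_apply resolves table slots itself while
re-recording, so the target needs no table storage of its own:

    // Hedged sketch of the replay step above: slot resolution happens in
    // apply(), not in the inline target.
    iree_hal_buffer_binding_table_t table =
        binding_tables ? binding_tables[i]
                       : iree_hal_buffer_binding_table_empty();
    IREE_RETURN_IF_ERROR(iree_hal_deferred_command_buffer_apply(
        command_buffer, inline_command_buffer, table));
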
diff --git a/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c b/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c
index d065cac..b513c54 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c
@@ -138,10 +138,14 @@
 
   iree_hal_task_command_buffer_t* command_buffer = NULL;
   iree_status_t status = iree_allocator_malloc(
-      host_allocator, sizeof(*command_buffer), (void**)&command_buffer);
+      host_allocator,
+      sizeof(*command_buffer) +
+          iree_hal_command_buffer_validation_state_size(mode, binding_capacity),
+      (void**)&command_buffer);
   if (iree_status_is_ok(status)) {
     iree_hal_command_buffer_initialize(
         device, mode, command_categories, queue_affinity, binding_capacity,
+        (uint8_t*)command_buffer + sizeof(*command_buffer),
         &iree_hal_task_command_buffer_vtable, &command_buffer->base);
     command_buffer->host_allocator = host_allocator;
     command_buffer->scope = scope;
@@ -472,7 +476,8 @@
 //===----------------------------------------------------------------------===//
 
 static iree_status_t iree_hal_task_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_ref_t buffer_ref) {
   return iree_ok_status();
 }
 
@@ -490,9 +495,7 @@
 
 typedef struct iree_hal_cmd_fill_buffer_t {
   iree_task_dispatch_t task;
-  iree_hal_buffer_t* target_buffer;
-  iree_device_size_t target_offset;
-  iree_device_size_t length;
+  iree_hal_buffer_ref_t target_ref;
   uint32_t pattern_length;
   uint8_t pattern[8];
 } iree_hal_cmd_fill_buffer_t;
@@ -507,14 +510,14 @@
   iree_device_size_t length_per_slice = tile_context->workgroup_size[0];
   iree_device_size_t slice_offset =
       (iree_device_size_t)tile_context->workgroup_xyz[0] * length_per_slice;
-  iree_device_size_t remaining_length = cmd->length - slice_offset;
+  iree_device_size_t remaining_length = cmd->target_ref.length - slice_offset;
   iree_device_size_t slice_length =
       iree_min(length_per_slice, remaining_length);
   IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (uint64_t)slice_length);
 
   iree_status_t status = iree_hal_buffer_map_fill(
-      cmd->target_buffer, cmd->target_offset + slice_offset, slice_length,
-      cmd->pattern, cmd->pattern_length);
+      cmd->target_ref.buffer, cmd->target_ref.offset + slice_offset,
+      slice_length, cmd->pattern, cmd->pattern_length);
 
   IREE_TRACE_ZONE_END(z0);
   return status;
@@ -522,14 +525,13 @@
 
 static iree_status_t iree_hal_task_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, const void* pattern,
+    iree_hal_buffer_ref_t target_ref, const void* pattern,
     iree_host_size_t pattern_length) {
   iree_hal_task_command_buffer_t* command_buffer =
       iree_hal_task_command_buffer_cast(base_command_buffer);
 
   IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
-      command_buffer->resource_set, 1, &target_buffer));
+      command_buffer->resource_set, 1, &target_ref.buffer));
 
   iree_hal_cmd_fill_buffer_t* cmd = NULL;
   IREE_RETURN_IF_ERROR(
@@ -541,7 +543,7 @@
       /*z=*/1,
   };
   const uint32_t workgroup_count[3] = {
-      /*x=*/iree_device_size_ceil_div(length, workgroup_size[0]),
+      /*x=*/iree_device_size_ceil_div(target_ref.length, workgroup_size[0]),
       /*y=*/1,
       /*z=*/1,
   };
@@ -549,9 +551,7 @@
       command_buffer->scope,
       iree_task_make_dispatch_closure(iree_hal_cmd_fill_tile, (void*)cmd),
       workgroup_size, workgroup_count, &cmd->task);
-  cmd->target_buffer = target_buffer;
-  cmd->target_offset = target_offset;
-  cmd->length = length;
+  cmd->target_ref = target_ref;
   memcpy(cmd->pattern, pattern, pattern_length);
   cmd->pattern_length = pattern_length;
 
@@ -565,9 +565,7 @@
 
 typedef struct iree_hal_cmd_update_buffer_t {
   iree_task_call_t task;
-  iree_hal_buffer_t* target_buffer;
-  iree_device_size_t target_offset;
-  iree_device_size_t length;
+  iree_hal_buffer_ref_t target_ref;
   uint8_t source_buffer[];
 } iree_hal_cmd_update_buffer_t;
 
@@ -577,24 +575,24 @@
   const iree_hal_cmd_update_buffer_t* cmd =
       (const iree_hal_cmd_update_buffer_t*)user_context;
   IREE_TRACE_ZONE_BEGIN(z0);
-  iree_status_t status = iree_hal_buffer_map_write(
-      cmd->target_buffer, cmd->target_offset, cmd->source_buffer, cmd->length);
+  iree_status_t status =
+      iree_hal_buffer_map_write(cmd->target_ref.buffer, cmd->target_ref.offset,
+                                cmd->source_buffer, cmd->target_ref.length);
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
 
 static iree_status_t iree_hal_task_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
   iree_hal_task_command_buffer_t* command_buffer =
       iree_hal_task_command_buffer_cast(base_command_buffer);
 
   IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
-      command_buffer->resource_set, 1, &target_buffer));
+      command_buffer->resource_set, 1, &target_ref.buffer));
 
   iree_host_size_t total_cmd_size =
-      sizeof(iree_hal_cmd_update_buffer_t) + length;
+      sizeof(iree_hal_cmd_update_buffer_t) + target_ref.length;
 
   iree_hal_cmd_update_buffer_t* cmd = NULL;
   IREE_RETURN_IF_ERROR(iree_arena_allocate(&command_buffer->arena,
@@ -604,12 +602,9 @@
       command_buffer->scope,
       iree_task_make_call_closure(iree_hal_cmd_update_buffer, (void*)cmd),
       &cmd->task);
-  cmd->target_buffer = target_buffer;
-  cmd->target_offset = target_offset;
-  cmd->length = length;
-
+  cmd->target_ref = target_ref;
   memcpy(cmd->source_buffer, (const uint8_t*)source_buffer + source_offset,
-         cmd->length);
+         cmd->target_ref.length);
 
   return iree_hal_task_command_buffer_emit_execution_task(command_buffer,
                                                           &cmd->task.header);
@@ -629,11 +624,8 @@
 
 typedef struct iree_hal_cmd_copy_buffer_t {
   iree_task_dispatch_t task;
-  iree_hal_buffer_t* source_buffer;
-  iree_device_size_t source_offset;
-  iree_hal_buffer_t* target_buffer;
-  iree_device_size_t target_offset;
-  iree_device_size_t length;
+  iree_hal_buffer_ref_t source_ref;
+  iree_hal_buffer_ref_t target_ref;
 } iree_hal_cmd_copy_buffer_t;
 
 static iree_status_t iree_hal_cmd_copy_tile(
@@ -646,14 +638,15 @@
   iree_device_size_t length_per_slice = tile_context->workgroup_size[0];
   iree_device_size_t slice_offset =
       (iree_device_size_t)tile_context->workgroup_xyz[0] * length_per_slice;
-  iree_device_size_t remaining_length = cmd->length - slice_offset;
+  iree_device_size_t remaining_length = cmd->target_ref.length - slice_offset;
   iree_device_size_t slice_length =
       iree_min(length_per_slice, remaining_length);
   IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (uint64_t)slice_length);
 
   iree_status_t status = iree_hal_buffer_map_copy(
-      cmd->source_buffer, cmd->source_offset + slice_offset, cmd->target_buffer,
-      cmd->target_offset + slice_offset, slice_length);
+      cmd->source_ref.buffer, cmd->source_ref.offset + slice_offset,
+      cmd->target_ref.buffer, cmd->target_ref.offset + slice_offset,
+      slice_length);
 
   IREE_TRACE_ZONE_END(z0);
   return status;
@@ -661,15 +654,16 @@
 
 static iree_status_t iree_hal_task_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
   iree_hal_task_command_buffer_t* command_buffer =
       iree_hal_task_command_buffer_cast(base_command_buffer);
 
-  const iree_hal_buffer_t* buffers[2] = {source_buffer, target_buffer};
-  IREE_RETURN_IF_ERROR(
-      iree_hal_resource_set_insert(command_buffer->resource_set, 2, buffers));
+  const iree_hal_buffer_t* buffers[2] = {
+      source_ref.buffer,
+      target_ref.buffer,
+  };
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, IREE_ARRAYSIZE(buffers), buffers));
 
   iree_hal_cmd_copy_buffer_t* cmd = NULL;
   IREE_RETURN_IF_ERROR(
@@ -681,7 +675,7 @@
       /*z=*/1,
   };
   const uint32_t workgroup_count[3] = {
-      /*x=*/iree_device_size_ceil_div(length, workgroup_size[0]),
+      /*x=*/iree_device_size_ceil_div(target_ref.length, workgroup_size[0]),
       /*y=*/1,
       /*z=*/1,
   };
@@ -689,11 +683,8 @@
       command_buffer->scope,
       iree_task_make_dispatch_closure(iree_hal_cmd_copy_tile, (void*)cmd),
       workgroup_size, workgroup_count, &cmd->task);
-  cmd->source_buffer = source_buffer;
-  cmd->source_offset = source_offset;
-  cmd->target_buffer = target_buffer;
-  cmd->target_offset = target_offset;
-  cmd->length = length;
+  cmd->source_ref = source_ref;
+  cmd->target_ref = target_ref;
 
   return iree_hal_task_command_buffer_emit_execution_task(command_buffer,
                                                           &cmd->task.header);
@@ -705,9 +696,8 @@
 
 static iree_status_t iree_hal_task_command_buffer_collective(
     iree_hal_command_buffer_t* base_command_buffer, iree_hal_channel_t* channel,
-    iree_hal_collective_op_t op, uint32_t param,
-    iree_hal_buffer_binding_t send_binding,
-    iree_hal_buffer_binding_t recv_binding, iree_device_size_t element_count) {
+    iree_hal_collective_op_t op, uint32_t param, iree_hal_buffer_ref_t send_ref,
+    iree_hal_buffer_ref_t recv_ref, iree_device_size_t element_count) {
   // The channel can be used as a vtable if we want to inject collective APIs -
   // the device creation function would set up the channel once and we'll
   // receive it here each time. When interacting with the task system we want to
@@ -764,8 +754,7 @@
 static iree_status_t iree_hal_task_command_buffer_push_descriptor_set(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
-    iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings) {
+    iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings) {
   iree_hal_task_command_buffer_t* command_buffer =
       iree_hal_task_command_buffer_cast(base_command_buffer);
 
@@ -777,12 +766,12 @@
   iree_host_size_t binding_base =
       set * IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT;
   for (iree_host_size_t i = 0; i < binding_count; ++i) {
-    if (IREE_UNLIKELY(bindings[i].binding >=
+    if (IREE_UNLIKELY(bindings[i].ordinal >=
                       IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT)) {
       return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                               "buffer binding index out of bounds");
     }
-    iree_host_size_t binding_ordinal = binding_base + bindings[i].binding;
+    iree_host_size_t binding_ordinal = binding_base + bindings[i].ordinal;
 
     // TODO(benvanik): batch insert by getting the resources in their own list.
     IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
@@ -822,7 +811,7 @@
   uint16_t push_constant_count;
 
   // Total number of binding base pointers in |binding_ptrs| and
-  // |binding_lengths|. The set is packed densely based on which binidngs are
+  // |binding_lengths|. The set is packed densely based on which bindings are
   // used (known at compile-time).
   uint16_t binding_count;
 
@@ -998,20 +987,19 @@
 static iree_status_t iree_hal_task_command_buffer_dispatch_indirect(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_executable_t* executable, int32_t entry_point,
-    iree_hal_buffer_t* workgroups_buffer,
-    iree_device_size_t workgroups_offset) {
+    iree_hal_buffer_ref_t workgroups_ref) {
   iree_hal_task_command_buffer_t* command_buffer =
       iree_hal_task_command_buffer_cast(base_command_buffer);
 
-  const void* resources[2] = {executable, workgroups_buffer};
+  const void* resources[2] = {executable, workgroups_ref.buffer};
   IREE_RETURN_IF_ERROR(
       iree_hal_resource_set_insert(command_buffer->resource_set, 2, resources));
 
   // TODO(benvanik): track mapping so we can properly map/unmap/flush/etc.
   iree_hal_buffer_mapping_t buffer_mapping = {{0}};
   IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
-      workgroups_buffer, IREE_HAL_MAPPING_MODE_PERSISTENT,
-      IREE_HAL_MEMORY_ACCESS_READ, workgroups_offset, 3 * sizeof(uint32_t),
+      workgroups_ref.buffer, IREE_HAL_MAPPING_MODE_PERSISTENT,
+      IREE_HAL_MEMORY_ACCESS_READ, workgroups_ref.offset, 3 * sizeof(uint32_t),
       &buffer_mapping));
 
   iree_hal_cmd_dispatch_t* cmd = NULL;
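Throughout this change the (buffer, offset, length) parameter triples collapse
into a single iree_hal_buffer_ref_t. Inferred only from the fields this diff
touches, the shape is roughly the following; the actual definition lives in the
HAL headers and may carry additional fields (e.g. a binding-table slot for
indirect refs):

  typedef struct iree_hal_buffer_ref_t {
    uint32_t ordinal;           // descriptor binding ordinal (was .binding)
    iree_hal_buffer_t* buffer;  // concrete buffer, or NULL for indirect refs
    iree_device_size_t offset;  // byte offset into the (resolved) buffer
    iree_device_size_t length;  // byte length, or IREE_WHOLE_BUFFER
  } iree_hal_buffer_ref_t;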
diff --git a/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m b/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m
index 4a2a07a..b2e343d 100644
--- a/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m
+++ b/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m
@@ -353,11 +353,16 @@
 
   iree_hal_metal_command_buffer_t* command_buffer = NULL;
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_allocator_malloc(host_allocator, sizeof(*command_buffer), (void**)&command_buffer));
+      z0,
+      iree_allocator_malloc(host_allocator,
+                            sizeof(*command_buffer) + iree_hal_command_buffer_validation_state_size(
+                                                          mode, binding_capacity),
+                            (void**)&command_buffer));
 
   iree_hal_command_buffer_initialize(device, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY,
-                                     binding_capacity, &iree_hal_metal_command_buffer_vtable,
-                                     &command_buffer->base);
+                                     binding_capacity,
+                                     (uint8_t*)command_buffer + sizeof(*command_buffer),
+                                     &iree_hal_metal_command_buffer_vtable, &command_buffer->base);
   command_buffer->device = device;
   command_buffer->queue = [queue retain];  // +1
   command_buffer->builtin_executable = builtin_executable;
@@ -564,7 +569,7 @@
 }
 
 static iree_status_t iree_hal_metal_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_ref_t buffer_ref) {
   // This is a hint to the device and we have nothing to do for Metal.
   return iree_ok_status();
 }
@@ -629,16 +634,16 @@
 }
 
 static iree_status_t iree_hal_metal_command_buffer_prepare_fill_buffer(
-    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length, const void* pattern,
-    iree_host_size_t pattern_length) {
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_ref_t target_ref,
+    const void* pattern, iree_host_size_t pattern_length) {
   iree_hal_metal_command_buffer_t* command_buffer =
       iree_hal_metal_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
 
   id<MTLBuffer> target_device_buffer =
-      iree_hal_metal_buffer_handle(iree_hal_buffer_allocated_buffer(target_buffer));
-  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+      iree_hal_metal_buffer_handle(iree_hal_buffer_allocated_buffer(target_ref.buffer));
+  iree_device_size_t target_offset =
+      iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset;
 
   // Allocate the command segment and keep track of all necessary API data.
   uint8_t* storage_base = NULL;
@@ -658,7 +663,7 @@
 
   segment->fill_buffer.target_buffer = target_device_buffer;
   segment->fill_buffer.target_offset = target_offset;
-  segment->fill_buffer.length = length;
+  segment->fill_buffer.length = target_ref.length;
   segment->fill_buffer.pattern = (const void*)pattern_ptr;
   segment->fill_buffer.pattern_length = pattern_length;
 
@@ -774,8 +779,7 @@
 
 static iree_status_t iree_hal_metal_command_buffer_prepare_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
   iree_hal_metal_command_buffer_t* command_buffer =
       iree_hal_metal_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -784,50 +788,53 @@
   // staging buffer and then copy over.
 
   iree_const_byte_span_t source_data_span =
-      iree_make_const_byte_span((uint8_t*)source_buffer + source_offset, length);
+      iree_make_const_byte_span((uint8_t*)source_buffer + source_offset, target_ref.length);
   uint32_t offset = 0;
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_metal_staging_buffer_append(command_buffer->staging_buffer, source_data_span,
                                                /*alignment=*/4, &offset));
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1, &target_buffer));
+      z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1, &target_ref.buffer));
 
   id<MTLBuffer> target_device_buffer =
-      iree_hal_metal_buffer_handle(iree_hal_buffer_allocated_buffer(target_buffer));
-  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+      iree_hal_metal_buffer_handle(iree_hal_buffer_allocated_buffer(target_ref.buffer));
+  iree_device_size_t target_offset =
+      iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset;
 
   iree_status_t status = iree_hal_metal_command_segment_create_copy_buffer(
       command_buffer, command_buffer->staging_buffer->metal_buffer, offset, target_device_buffer,
-      target_offset, length);
+      target_offset, target_ref.length);
 
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
 
 static iree_status_t iree_hal_metal_command_buffer_prepare_copy_buffer(
-    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* source_buffer,
-    iree_device_size_t source_offset, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length) {
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_ref_t source_ref,
+    iree_hal_buffer_ref_t target_ref) {
   iree_hal_metal_command_buffer_t* command_buffer =
       iree_hal_metal_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
 
-  const iree_hal_buffer_t* buffers[2] = {source_buffer, target_buffer};
+  const iree_hal_buffer_t* resources[2] = {source_ref.buffer, target_ref.buffer};
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_hal_resource_set_insert(command_buffer->resource_set, 2, buffers));
+      z0, iree_hal_resource_set_insert(command_buffer->resource_set, IREE_ARRAYSIZE(resources),
+                                       resources));
 
   id<MTLBuffer> source_device_buffer =
-      iree_hal_metal_buffer_handle(iree_hal_buffer_allocated_buffer(source_buffer));
+      iree_hal_metal_buffer_handle(iree_hal_buffer_allocated_buffer(source_ref.buffer));
   id<MTLBuffer> target_device_buffer =
-      iree_hal_metal_buffer_handle(iree_hal_buffer_allocated_buffer(target_buffer));
+      iree_hal_metal_buffer_handle(iree_hal_buffer_allocated_buffer(target_ref.buffer));
 
-  source_offset += iree_hal_buffer_byte_offset(source_buffer);
-  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+  iree_device_size_t source_offset =
+      iree_hal_buffer_byte_offset(source_ref.buffer) + source_ref.offset;
+  iree_device_size_t target_offset =
+      iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset;
 
   iree_status_t status = iree_hal_metal_command_segment_create_copy_buffer(
       command_buffer, source_device_buffer, source_offset, target_device_buffer, target_offset,
-      length);
+      target_ref.length);
 
   IREE_TRACE_ZONE_END(z0);
   return status;
@@ -835,8 +842,8 @@
 
 static iree_status_t iree_hal_metal_command_buffer_collective(
     iree_hal_command_buffer_t* base_command_buffer, iree_hal_channel_t* channel,
-    iree_hal_collective_op_t op, uint32_t param, iree_hal_buffer_binding_t send_binding,
-    iree_hal_buffer_binding_t recv_binding, iree_device_size_t element_count) {
+    iree_hal_collective_op_t op, uint32_t param, iree_hal_buffer_ref_t send_ref,
+    iree_hal_buffer_ref_t recv_ref, iree_device_size_t element_count) {
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "collectives not yet supported");
 }
 
@@ -870,8 +877,7 @@
 
 static iree_status_t iree_hal_metal_command_buffer_push_descriptor_set(
     iree_hal_command_buffer_t* base_command_buffer, iree_hal_pipeline_layout_t* pipeline_layout,
-    uint32_t set, iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings) {
+    uint32_t set, iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings) {
   iree_hal_metal_command_buffer_t* command_buffer =
       iree_hal_metal_command_buffer_cast(base_command_buffer);
 
@@ -894,7 +900,7 @@
     iree_hal_metal_descriptor_t* descriptor = &descriptors[i];
 
     descriptor->set = set;
-    descriptor->binding = bindings[i].binding;
+    descriptor->binding = bindings[i].ordinal;
     descriptor->buffer = bindings[i].buffer;
     descriptor->offset = bindings[i].offset;
 
@@ -1084,7 +1090,6 @@
 
 static iree_status_t iree_hal_metal_command_buffer_prepare_dispatch_indirect(
     iree_hal_command_buffer_t* base_command_buffer, iree_hal_executable_t* executable,
-    int32_t entry_point, iree_hal_buffer_t* workgroups_buffer,
-    iree_device_size_t workgroups_offset) {
+    int32_t entry_point, iree_hal_buffer_ref_t workgroups_ref) {
   IREE_TRACE_ZONE_BEGIN(z0);
 
@@ -1093,8 +1099,8 @@
       z0, iree_hal_metal_command_segment_create_dispatch(base_command_buffer, executable,
                                                          entry_point, &segment));
   segment->workgroups_buffer =
-      iree_hal_metal_buffer_handle(iree_hal_buffer_allocated_buffer(workgroups_buffer));
-  segment->workgroups_offset = workgroups_offset;
+      iree_hal_metal_buffer_handle(iree_hal_buffer_allocated_buffer(workgroups_ref.buffer));
+  segment->workgroups_offset = workgroups_ref.offset;
 
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
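A note on the address math repeated in each backend here: a ref's offset is
relative to its logical buffer, and the logical buffer may itself be a
sub-span of a larger device allocation, so both offsets must be summed. A
small helper sketch of the recurring pattern (the helper name is illustrative,
not part of this change):

  static iree_device_size_t iree_hal_buffer_ref_byte_offset(
      iree_hal_buffer_ref_t ref) {
    // Offset of the logical buffer within its backing allocation plus the
    // caller-specified offset within the logical buffer.
    return iree_hal_buffer_byte_offset(ref.buffer) + ref.offset;
  }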
diff --git a/runtime/src/iree/hal/drivers/metal/metal_device.m b/runtime/src/iree/hal/drivers/metal/metal_device.m
index e16a883..620ff57 100644
--- a/runtime/src/iree/hal/drivers/metal/metal_device.m
+++ b/runtime/src/iree/hal/drivers/metal/metal_device.m
@@ -247,9 +247,13 @@
     iree_host_size_t binding_capacity, iree_hal_command_buffer_t** out_command_buffer) {
   iree_hal_metal_device_t* device = iree_hal_metal_device_cast(base_device);
 
-  if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT))
+  if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT)) {
     return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                             "multi-shot command buffer not yet supported");
+  } else if (binding_capacity > 0) {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "indirect command buffers not yet supported");
+  }
 
   return iree_hal_metal_direct_command_buffer_create(
       base_device, mode, command_categories, binding_capacity,
diff --git a/runtime/src/iree/hal/drivers/vulkan/BUILD.bazel b/runtime/src/iree/hal/drivers/vulkan/BUILD.bazel
index 3a9568a..ce5b68b 100644
--- a/runtime/src/iree/hal/drivers/vulkan/BUILD.bazel
+++ b/runtime/src/iree/hal/drivers/vulkan/BUILD.bazel
@@ -78,6 +78,7 @@
         "//runtime/src/iree/hal/drivers/vulkan/util:arena",
         "//runtime/src/iree/hal/drivers/vulkan/util:intrusive_list",
         "//runtime/src/iree/hal/drivers/vulkan/util:ref_ptr",
+        "//runtime/src/iree/hal/utils:deferred_command_buffer",
         "//runtime/src/iree/hal/utils:file_transfer",
         "//runtime/src/iree/hal/utils:memory_file",
         "//runtime/src/iree/hal/utils:resource_set",
diff --git a/runtime/src/iree/hal/drivers/vulkan/CMakeLists.txt b/runtime/src/iree/hal/drivers/vulkan/CMakeLists.txt
index 12cdf9f..76e376f 100644
--- a/runtime/src/iree/hal/drivers/vulkan/CMakeLists.txt
+++ b/runtime/src/iree/hal/drivers/vulkan/CMakeLists.txt
@@ -73,6 +73,7 @@
     iree::hal::drivers::vulkan::util::arena
     iree::hal::drivers::vulkan::util::intrusive_list
     iree::hal::drivers::vulkan::util::ref_ptr
+    iree::hal::utils::deferred_command_buffer
     iree::hal::utils::file_transfer
     iree::hal::utils::memory_file
     iree::hal::utils::resource_set
diff --git a/runtime/src/iree/hal/drivers/vulkan/builtin_executables.cc b/runtime/src/iree/hal/drivers/vulkan/builtin_executables.cc
index b4faf57..2291731 100644
--- a/runtime/src/iree/hal/drivers/vulkan/builtin_executables.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/builtin_executables.cc
@@ -159,8 +159,8 @@
                               pattern_length);
   }
 
-  iree_hal_descriptor_set_binding_t binding;
-  binding.binding = 0;
+  iree_hal_buffer_ref_t binding;
+  binding.ordinal = 0;
   binding.buffer = target_buffer;
   binding.offset = 0;
   binding.length = IREE_WHOLE_BUFFER;
diff --git a/runtime/src/iree/hal/drivers/vulkan/descriptor_set_arena.cc b/runtime/src/iree/hal/drivers/vulkan/descriptor_set_arena.cc
index f121036..44abc79 100644
--- a/runtime/src/iree/hal/drivers/vulkan/descriptor_set_arena.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/descriptor_set_arena.cc
@@ -23,9 +23,8 @@
 namespace {
 
 static void PopulateDescriptorSetWriteInfos(
-    iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings, VkDescriptorSet dst_set,
-    Arena* arena, iree_host_size_t* out_info_count,
+    iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings,
+    VkDescriptorSet dst_set, Arena* arena, iree_host_size_t* out_info_count,
     VkWriteDescriptorSet** out_infos) {
   arena->Reset();
   auto buffer_infos =
@@ -72,7 +71,7 @@
     write_info.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
     write_info.pNext = nullptr;
     write_info.dstSet = dst_set;
-    write_info.dstBinding = binding.binding;
+    write_info.dstBinding = binding.ordinal;
     write_info.dstArrayElement = 0;
     write_info.descriptorCount = 1;
     write_info.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
@@ -103,7 +102,7 @@
 iree_status_t DescriptorSetArena::BindDescriptorSet(
     VkCommandBuffer command_buffer, iree_hal_pipeline_layout_t* pipeline_layout,
     uint32_t set, iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings) {
+    const iree_hal_buffer_ref_t* bindings) {
   // Always prefer using push descriptors when available as we can avoid the
   // additional API overhead of updating/resetting pools.
   if (logical_device_->enabled_extensions().push_descriptors) {
@@ -201,7 +200,7 @@
 void DescriptorSetArena::PushDescriptorSet(
     VkCommandBuffer command_buffer, iree_hal_pipeline_layout_t* pipeline_layout,
     uint32_t set, iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings) {
+    const iree_hal_buffer_ref_t* bindings) {
   IREE_TRACE_SCOPE_NAMED("DescriptorSetArena::PushDescriptorSet");
   VkPipelineLayout device_pipeline_layout =
       iree_hal_vulkan_native_pipeline_layout_handle(pipeline_layout);
diff --git a/runtime/src/iree/hal/drivers/vulkan/descriptor_set_arena.h b/runtime/src/iree/hal/drivers/vulkan/descriptor_set_arena.h
index c07415b..6ae5807 100644
--- a/runtime/src/iree/hal/drivers/vulkan/descriptor_set_arena.h
+++ b/runtime/src/iree/hal/drivers/vulkan/descriptor_set_arena.h
@@ -34,11 +34,10 @@
   // Allocates and binds a descriptor set from the arena.
   // The command buffer will have the descriptor set containing |bindings| bound
   // to it.
-  iree_status_t BindDescriptorSet(
-      VkCommandBuffer command_buffer,
-      iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
-      iree_host_size_t binding_count,
-      const iree_hal_descriptor_set_binding_t* bindings);
+  iree_status_t BindDescriptorSet(VkCommandBuffer command_buffer,
+                                  iree_hal_pipeline_layout_t* pipeline_layout,
+                                  uint32_t set, iree_host_size_t binding_count,
+                                  const iree_hal_buffer_ref_t* bindings);
 
   // Flushes all pending writes to descriptor sets allocated from the arena and
   // returns a group that - when dropped - will release the descriptor sets
@@ -52,7 +51,7 @@
   void PushDescriptorSet(VkCommandBuffer command_buffer,
                          iree_hal_pipeline_layout_t* pipeline_layout,
                          uint32_t set, iree_host_size_t binding_count,
-                         const iree_hal_descriptor_set_binding_t* bindings);
+                         const iree_hal_buffer_ref_t* bindings);
 
   VkDeviceHandle* logical_device_;
   DescriptorPoolCache* descriptor_pool_cache_;
diff --git a/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.cc b/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.cc
index 000584a..d956347 100644
--- a/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.cc
@@ -112,12 +112,15 @@
       z0, command_pool->Allocate(&allocate_info, &handle));
 
   iree_hal_vulkan_direct_command_buffer_t* command_buffer = NULL;
-  iree_status_t status =
-      iree_allocator_malloc(logical_device->host_allocator(),
-                            sizeof(*command_buffer), (void**)&command_buffer);
+  iree_status_t status = iree_allocator_malloc(
+      logical_device->host_allocator(),
+      sizeof(*command_buffer) +
+          iree_hal_command_buffer_validation_state_size(mode, binding_capacity),
+      (void**)&command_buffer);
   if (iree_status_is_ok(status)) {
     iree_hal_command_buffer_initialize(
         device, mode, command_categories, queue_affinity, binding_capacity,
+        (uint8_t*)command_buffer + sizeof(*command_buffer),
         &iree_hal_vulkan_direct_command_buffer_vtable, &command_buffer->base);
     command_buffer->logical_device = logical_device;
     command_buffer->tracing_context = tracing_context;
@@ -350,7 +353,7 @@
   iree_inline_array(VkMemoryBarrier, memory_barrier_infos, memory_barrier_count,
                     host_allocator);
   for (int i = 0; i < memory_barrier_count; ++i) {
-    const auto& memory_barrier = memory_barriers[i];
+    const iree_hal_memory_barrier_t& memory_barrier = memory_barriers[i];
     VkMemoryBarrier* info = iree_inline_array_at(memory_barrier_infos, i);
     info->sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
     info->pNext = NULL;
@@ -363,7 +366,7 @@
   iree_inline_array(VkBufferMemoryBarrier, buffer_barrier_infos,
                     buffer_barrier_count, host_allocator);
   for (int i = 0; i < buffer_barrier_count; ++i) {
-    const auto& buffer_barrier = buffer_barriers[i];
+    const iree_hal_buffer_barrier_t& buffer_barrier = buffer_barriers[i];
     VkBufferMemoryBarrier* info = iree_inline_array_at(buffer_barrier_infos, i);
     info->sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
     info->pNext = NULL;
@@ -373,9 +376,10 @@
         iree_hal_vulkan_convert_access_mask(buffer_barrier.target_scope);
     info->srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
     info->dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-    info->buffer = iree_hal_vulkan_buffer_handle(buffer_barrier.buffer);
-    info->offset = buffer_barrier.offset;
-    info->size = buffer_barrier.length;
+    info->buffer =
+        iree_hal_vulkan_buffer_handle(buffer_barrier.buffer_ref.buffer);
+    info->offset = buffer_barrier.buffer_ref.offset;
+    info->size = buffer_barrier.buffer_ref.length;
   }
 
   command_buffer->syms->vkCmdPipelineBarrier(
@@ -451,7 +455,7 @@
   iree_inline_array(VkMemoryBarrier, memory_barrier_infos, memory_barrier_count,
                     host_allocator);
   for (int i = 0; i < memory_barrier_count; ++i) {
-    const auto& memory_barrier = memory_barriers[i];
+    const iree_hal_memory_barrier_t& memory_barrier = memory_barriers[i];
     VkMemoryBarrier* info = iree_inline_array_at(memory_barrier_infos, i);
     info->sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
     info->pNext = NULL;
@@ -464,7 +468,7 @@
   iree_inline_array(VkBufferMemoryBarrier, buffer_barrier_infos,
                     buffer_barrier_count, host_allocator);
   for (int i = 0; i < buffer_barrier_count; ++i) {
-    const auto& buffer_barrier = buffer_barriers[i];
+    const iree_hal_buffer_barrier_t& buffer_barrier = buffer_barriers[i];
     VkBufferMemoryBarrier* info = iree_inline_array_at(buffer_barrier_infos, i);
     info->sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
     info->pNext = NULL;
@@ -474,9 +478,10 @@
         iree_hal_vulkan_convert_access_mask(buffer_barrier.target_scope);
     info->srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
     info->dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-    info->buffer = iree_hal_vulkan_buffer_handle(buffer_barrier.buffer);
-    info->offset = buffer_barrier.offset;
-    info->size = buffer_barrier.length;
+    info->buffer =
+        iree_hal_vulkan_buffer_handle(buffer_barrier.buffer_ref.buffer);
+    info->offset = buffer_barrier.buffer_ref.offset;
+    info->size = buffer_barrier.buffer_ref.length;
   }
 
   command_buffer->syms->vkCmdWaitEvents(
@@ -497,7 +502,8 @@
 }
 
 static iree_status_t iree_hal_vulkan_direct_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_ref_t buffer_ref) {
   // NOTE: we could use this to prevent queue family transitions.
   return iree_ok_status();
 }
@@ -526,23 +532,25 @@
 
 static iree_status_t iree_hal_vulkan_direct_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, const void* pattern,
+    iree_hal_buffer_ref_t target_ref, const void* pattern,
     iree_host_size_t pattern_length) {
   iree_hal_vulkan_direct_command_buffer_t* command_buffer =
       iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
-  VkBuffer target_device_buffer = iree_hal_vulkan_buffer_handle(target_buffer);
+  VkBuffer target_device_buffer =
+      iree_hal_vulkan_buffer_handle(target_ref.buffer);
 
   IREE_VULKAN_TRACE_ZONE_BEGIN(command_buffer->tracing_context,
                                command_buffer->handle);
 
   IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
-      command_buffer->resource_set, 1, &target_buffer));
+      command_buffer->resource_set, 1, &target_ref.buffer));
 
   // vkCmdFillBuffer requires a 4 byte alignment for the offset, pattern, and
   // length. We use a polyfill here that fills the unaligned start and end of
   // fill operations, if needed.
 
+  iree_device_size_t target_offset = target_ref.offset;
+  iree_device_size_t length = target_ref.length;
   if (target_offset % 4 != 0 || length % 4 != 0) {
     // TODO(scotttodd): only restore push constants that have been modified?
     //                  (this can pass uninitialized memory right now, which
@@ -550,7 +558,7 @@
     IREE_RETURN_IF_ERROR(
         command_buffer->builtin_executables->FillBufferUnaligned(
             command_buffer->handle, &(command_buffer->descriptor_set_arena),
-            target_buffer, target_offset, length, pattern, pattern_length,
+            target_ref.buffer, target_offset, length, pattern, pattern_length,
             command_buffer->push_constants_storage));
 
     // Continue using vkCmdFillBuffer below, but only for the inner aligned
@@ -565,7 +573,7 @@
         iree_device_align(target_offset, 4);
     iree_device_size_t target_end = target_offset + length;
     iree_device_size_t rounded_down_target_end = (target_end / 4) * 4;
-    length -= (aligned_target_offset - target_offset) +
+    length -= (aligned_target_offset - target_ref.offset) +
               (target_end - rounded_down_target_end);
     target_offset = aligned_target_offset;
   }
@@ -573,7 +581,7 @@
   if (length > 0) {
     // Note that vkCmdFillBuffer only accepts 4-byte aligned values so we need
     // to splat out our variable-length pattern.
-    target_offset += iree_hal_buffer_byte_offset(target_buffer);
+    target_offset += iree_hal_buffer_byte_offset(target_ref.buffer);
     uint32_t dword_pattern =
         iree_hal_vulkan_splat_pattern(pattern, pattern_length);
     command_buffer->syms->vkCmdFillBuffer(command_buffer->handle,
@@ -589,17 +597,17 @@
 
 static iree_status_t iree_hal_vulkan_direct_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
   iree_hal_vulkan_direct_command_buffer_t* command_buffer =
       iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
-  VkBuffer target_device_buffer = iree_hal_vulkan_buffer_handle(target_buffer);
+  VkBuffer target_device_buffer =
+      iree_hal_vulkan_buffer_handle(target_ref.buffer);
 
   IREE_VULKAN_TRACE_ZONE_BEGIN(command_buffer->tracing_context,
                                command_buffer->handle);
 
   IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
-      command_buffer->resource_set, 1, &target_buffer));
+      command_buffer->resource_set, 1, &target_ref.buffer));
 
   // Vulkan only allows updates of <= 65536 because you really, really, really
   // shouldn't do large updates like this (as it wastes command buffer space and
@@ -608,7 +616,9 @@
   // into multiple updates over the entire desired range.
   const auto* source_buffer_ptr =
       static_cast<const uint8_t*>(source_buffer) + source_offset;
-  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+  iree_device_size_t target_offset =
+      iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset;
+  iree_device_size_t length = target_ref.length;
   while (length > 0) {
     iree_device_size_t chunk_length =
         iree_min((iree_device_size_t)65536u, length);
@@ -628,25 +638,27 @@
 
 static iree_status_t iree_hal_vulkan_direct_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
   iree_hal_vulkan_direct_command_buffer_t* command_buffer =
       iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
-  VkBuffer source_device_buffer = iree_hal_vulkan_buffer_handle(source_buffer);
-  VkBuffer target_device_buffer = iree_hal_vulkan_buffer_handle(target_buffer);
+  VkBuffer source_device_buffer =
+      iree_hal_vulkan_buffer_handle(source_ref.buffer);
+  VkBuffer target_device_buffer =
+      iree_hal_vulkan_buffer_handle(target_ref.buffer);
 
   IREE_VULKAN_TRACE_ZONE_BEGIN(command_buffer->tracing_context,
                                command_buffer->handle);
 
-  const iree_hal_buffer_t* buffers[2] = {source_buffer, target_buffer};
+  const iree_hal_buffer_t* buffers[2] = {source_ref.buffer, target_ref.buffer};
   IREE_RETURN_IF_ERROR(
       iree_hal_resource_set_insert(command_buffer->resource_set, 2, buffers));
 
   VkBufferCopy region;
-  region.srcOffset = iree_hal_buffer_byte_offset(source_buffer) + source_offset;
-  region.dstOffset = iree_hal_buffer_byte_offset(target_buffer) + target_offset;
-  region.size = length;
+  region.srcOffset =
+      iree_hal_buffer_byte_offset(source_ref.buffer) + source_ref.offset;
+  region.dstOffset =
+      iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset;
+  region.size = target_ref.length;
   command_buffer->syms->vkCmdCopyBuffer(command_buffer->handle,
                                         source_device_buffer,
                                         target_device_buffer, 1, &region);
@@ -659,9 +671,8 @@
 
 static iree_status_t iree_hal_vulkan_direct_command_buffer_collective(
     iree_hal_command_buffer_t* base_command_buffer, iree_hal_channel_t* channel,
-    iree_hal_collective_op_t op, uint32_t param,
-    iree_hal_buffer_binding_t send_binding,
-    iree_hal_buffer_binding_t recv_binding, iree_device_size_t element_count) {
+    iree_hal_collective_op_t op, uint32_t param, iree_hal_buffer_ref_t send_ref,
+    iree_hal_buffer_ref_t recv_ref, iree_device_size_t element_count) {
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                           "collectives not yet implemented on Vulkan");
 }
@@ -692,8 +703,7 @@
 static iree_status_t iree_hal_vulkan_direct_command_buffer_push_descriptor_set(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
-    iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings) {
+    iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings) {
   iree_hal_vulkan_direct_command_buffer_t* command_buffer =
       iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
 
@@ -753,12 +763,11 @@
 static iree_status_t iree_hal_vulkan_direct_command_buffer_dispatch_indirect(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_executable_t* executable, int32_t entry_point,
-    iree_hal_buffer_t* workgroups_buffer,
-    iree_device_size_t workgroups_offset) {
+    iree_hal_buffer_ref_t workgroups_ref) {
   iree_hal_vulkan_direct_command_buffer_t* command_buffer =
       iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
 
-  const void* resources[2] = {executable, workgroups_buffer};
+  const void* resources[2] = {executable, workgroups_ref.buffer};
   IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
       command_buffer->resource_set, IREE_ARRAYSIZE(resources), resources));
 
@@ -781,8 +790,10 @@
       command_buffer->handle, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_handle);
 
   VkBuffer workgroups_device_buffer =
-      iree_hal_vulkan_buffer_handle(workgroups_buffer);
-  workgroups_offset += iree_hal_buffer_byte_offset(workgroups_buffer);
+      iree_hal_vulkan_buffer_handle(workgroups_ref.buffer);
+  iree_device_size_t workgroups_offset =
+      iree_hal_buffer_byte_offset(workgroups_ref.buffer) +
+      workgroups_ref.offset;
   command_buffer->syms->vkCmdDispatchIndirect(
       command_buffer->handle, workgroups_device_buffer, workgroups_offset);
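To make the unaligned-fill polyfill arithmetic above concrete, a worked
example (offsets are ref-relative, before the buffer's byte offset is
applied): for target_offset = 5 and length = 10,

  aligned_target_offset   = iree_device_align(5, 4);  // = 8
  target_end              = 5 + 10;                   // = 15
  rounded_down_target_end = (15 / 4) * 4;             // = 12
  length -= (8 - 5) + (15 - 12);                      // length = 4
  target_offset = 8;

so the builtin executable covers the unaligned head [5,8) and tail [12,15)
while vkCmdFillBuffer fills the aligned middle [8,12).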
 
diff --git a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
index 23e6d2d..7f362fd 100644
--- a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
@@ -31,6 +31,7 @@
 #include "iree/hal/drivers/vulkan/tracing.h"
 #include "iree/hal/drivers/vulkan/util/arena.h"
 #include "iree/hal/drivers/vulkan/util/ref_ptr.h"
+#include "iree/hal/utils/deferred_command_buffer.h"
 #include "iree/hal/utils/file_transfer.h"
 #include "iree/hal/utils/memory_file.h"
 
@@ -1525,6 +1526,16 @@
     iree_hal_command_buffer_t** out_command_buffer) {
   iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
 
+  // TODO(indirect-cmd): until implemented through the whole stack we use a
+  // deferred command buffer and then translate that to a concrete Vulkan
+  // command buffer when submitted with bindings.
+  if (binding_capacity > 0) {
+    return iree_hal_deferred_command_buffer_create(
+        base_device, mode, command_categories, binding_capacity,
+        &device->block_pool, iree_hal_device_host_allocator(base_device),
+        out_command_buffer);
+  }
+
   // TODO(scotttodd): revisit queue selection logic and remove this
   //   * the unaligned buffer fill polyfill and tracing timestamp queries may
   //     both insert dispatches into command buffers that at compile time are
@@ -1712,19 +1723,66 @@
     iree_hal_command_buffer_t* const* command_buffers,
     iree_hal_buffer_binding_table_t const* binding_tables) {
   iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
+
   // NOTE: today we are not discriminating queues based on command type.
   CommandQueue* queue = iree_hal_vulkan_device_select_queue(
       device, IREE_HAL_COMMAND_CATEGORY_DISPATCH, queue_affinity);
-  iree_hal_submission_batch_t batch = {
-      /*.wait_semaphores=*/wait_semaphore_list,
-      /*.command_buffer_count=*/command_buffer_count,
-      /*.command_buffers=*/command_buffers,
-      /*.signal_semaphores=*/signal_semaphore_list,
-  };
-  IREE_RETURN_IF_ERROR(queue->Submit(1, &batch));
+
+  // TODO(indirect-cmd): today we are using deferred command buffers to emulate
+  // indirect command buffers - this requires that we materialize real command
+  // buffers on demand here. When we natively support them we'll still need to
+  // process the binding table prior to submission but that can be done in a
+  // much more lightweight way depending on our concurrency needs.
+  if (IREE_UNLIKELY(command_buffer_count > 32)) {
+    // Guard the stack allocation, yuck.
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "currently limited to 32 command buffers per "
+                            "submission");
+  }
+  iree_hal_command_buffer_t** translated_command_buffers =
+      (iree_hal_command_buffer_t**)iree_alloca(
+          sizeof(iree_hal_command_buffer_t*) * command_buffer_count);
+  iree_status_t status = iree_ok_status();
+  for (iree_host_size_t i = 0; i < command_buffer_count; ++i) {
+    iree_hal_command_buffer_t* command_buffer = command_buffers[i];
+    if (iree_hal_deferred_command_buffer_isa(command_buffer)) {
+      iree_hal_command_buffer_t* translated_command_buffer = NULL;
+      status = iree_hal_vulkan_device_create_command_buffer(
+          base_device, iree_hal_command_buffer_mode(command_buffer),
+          iree_hal_command_buffer_allowed_categories(command_buffer),
+          queue_affinity, /*binding_capacity=*/0, &translated_command_buffer);
+      if (iree_status_is_ok(status)) {
+        status = iree_hal_deferred_command_buffer_apply(
+            command_buffer, translated_command_buffer, binding_tables[i]);
+      }
+      translated_command_buffers[i] = translated_command_buffer;
+    } else {
+      translated_command_buffers[i] = command_buffer;
+      iree_hal_command_buffer_retain(command_buffer);
+    }
+  }
+
+  if (iree_status_is_ok(status)) {
+    iree_hal_submission_batch_t batch = {
+        /*.wait_semaphores=*/wait_semaphore_list,
+        /*.command_buffer_count=*/command_buffer_count,
+        /*.command_buffers=*/translated_command_buffers,
+        /*.signal_semaphores=*/signal_semaphore_list,
+    };
+    status = queue->Submit(1, &batch);
+  }
+
+  for (iree_host_size_t i = 0; i < command_buffer_count; ++i) {
+    iree_hal_command_buffer_release(translated_command_buffers[i]);
+  }
+
   // HACK: we don't track async resource lifetimes so we have to block.
-  return iree_hal_semaphore_list_wait(signal_semaphore_list,
-                                      iree_infinite_timeout());
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_semaphore_list_wait(signal_semaphore_list,
+                                          iree_infinite_timeout());
+  }
+
+  return status;
 }
 
 static iree_status_t iree_hal_vulkan_device_queue_flush(
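For reference, the record-once/submit-many usage this emulation enables,
sketched against the queue-execute signature this change extends (the
command buffer and binding table are assumed already constructed):

  iree_hal_command_buffer_t* command_buffers[1] = {indirect_command_buffer};
  iree_hal_buffer_binding_table_t binding_tables[1] = {binding_table};
  IREE_RETURN_IF_ERROR(iree_hal_device_queue_execute(
      device, IREE_HAL_QUEUE_AFFINITY_ANY, wait_semaphore_list,
      signal_semaphore_list, IREE_ARRAYSIZE(command_buffers), command_buffers,
      binding_tables));

Each submission may pass a different binding_tables[i] for the same recorded
command buffer, which is what makes binding table slots worth recording.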
diff --git a/runtime/src/iree/hal/local/inline_command_buffer.c b/runtime/src/iree/hal/local/inline_command_buffer.c
index 22760bc..f055e98 100644
--- a/runtime/src/iree/hal/local/inline_command_buffer.c
+++ b/runtime/src/iree/hal/local/inline_command_buffer.c
@@ -85,8 +85,10 @@
       command_buffer->state.packed_binding_lengths;
 }
 
-iree_host_size_t iree_hal_inline_command_buffer_size(void) {
-  return sizeof(iree_hal_inline_command_buffer_t);
+iree_host_size_t iree_hal_inline_command_buffer_size(
+    iree_hal_command_buffer_mode_t mode, iree_host_size_t binding_capacity) {
+  return sizeof(iree_hal_inline_command_buffer_t) +
+         iree_hal_command_buffer_validation_state_size(mode, binding_capacity);
 }
 
 iree_status_t iree_hal_inline_command_buffer_initialize(
@@ -114,7 +116,8 @@
         IREE_STATUS_INVALID_ARGUMENT,
         "indirect command buffers do not support binding tables");
   }
-  if (storage.data_length < iree_hal_inline_command_buffer_size()) {
+  if (storage.data_length <
+      iree_hal_inline_command_buffer_size(mode, binding_capacity)) {
     return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                             "storage must have at least the capacity as "
                             "defined by iree_hal_inline_command_buffer_size");
@@ -128,6 +131,7 @@
 
   iree_hal_command_buffer_initialize(
       device, mode, command_categories, queue_affinity, binding_capacity,
+      (uint8_t*)command_buffer + sizeof(*command_buffer),
       &iree_hal_inline_command_buffer_vtable, &command_buffer->base);
   command_buffer->host_allocator = host_allocator;
   iree_hal_inline_command_buffer_reset(command_buffer);
@@ -157,13 +161,16 @@
 
   uint8_t* storage = NULL;
   iree_status_t status = iree_allocator_malloc(
-      host_allocator, iree_hal_inline_command_buffer_size(), (void**)&storage);
+      host_allocator,
+      iree_hal_inline_command_buffer_size(mode, binding_capacity),
+      (void**)&storage);
   iree_hal_command_buffer_t* command_buffer = NULL;
   if (iree_status_is_ok(status)) {
     status = iree_hal_inline_command_buffer_initialize(
         device, mode, command_categories, queue_affinity, binding_capacity,
         host_allocator,
-        iree_make_byte_span(storage, iree_hal_inline_command_buffer_size()),
+        iree_make_byte_span(storage, iree_hal_inline_command_buffer_size(
+                                         mode, binding_capacity)),
         &command_buffer);
   }
 
@@ -303,7 +310,8 @@
 //===----------------------------------------------------------------------===//
 
 static iree_status_t iree_hal_inline_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_ref_t buffer_ref) {
   // Could be treated as a cache invalidation as it indicates we won't be using
   // the existing buffer contents again.
   return iree_ok_status();
@@ -315,11 +323,10 @@
 
 static iree_status_t iree_hal_inline_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, const void* pattern,
+    iree_hal_buffer_ref_t target_ref, const void* pattern,
     iree_host_size_t pattern_length) {
-  return iree_hal_buffer_map_fill(target_buffer, target_offset, length, pattern,
-                                  pattern_length);
+  return iree_hal_buffer_map_fill(target_ref.buffer, target_ref.offset,
+                                  target_ref.length, pattern, pattern_length);
 }
 
 //===----------------------------------------------------------------------===//
@@ -328,11 +335,10 @@
 
 static iree_status_t iree_hal_inline_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
   return iree_hal_buffer_map_write(
-      target_buffer, target_offset,
-      (const uint8_t*)source_buffer + source_offset, length);
+      target_ref.buffer, target_ref.offset,
+      (const uint8_t*)source_buffer + source_offset, target_ref.length);
 }
 
 //===----------------------------------------------------------------------===//
@@ -341,11 +347,10 @@
 
 static iree_status_t iree_hal_inline_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length) {
-  return iree_hal_buffer_map_copy(source_buffer, source_offset, target_buffer,
-                                  target_offset, length);
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
+  return iree_hal_buffer_map_copy(source_ref.buffer, source_ref.offset,
+                                  target_ref.buffer, target_ref.offset,
+                                  target_ref.length);
 }
 
 //===----------------------------------------------------------------------===//
@@ -354,9 +359,8 @@
 
 static iree_status_t iree_hal_inline_command_buffer_collective(
     iree_hal_command_buffer_t* base_command_buffer, iree_hal_channel_t* channel,
-    iree_hal_collective_op_t op, uint32_t param,
-    iree_hal_buffer_binding_t send_binding,
-    iree_hal_buffer_binding_t recv_binding, iree_device_size_t element_count) {
+    iree_hal_collective_op_t op, uint32_t param, iree_hal_buffer_ref_t send_ref,
+    iree_hal_buffer_ref_t recv_ref, iree_device_size_t element_count) {
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                           "collectives not yet implemented on CPU");
 }
@@ -395,8 +399,7 @@
 static iree_status_t iree_hal_inline_command_buffer_push_descriptor_set(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
-    iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings) {
+    iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings) {
   iree_hal_inline_command_buffer_t* command_buffer =
       iree_hal_inline_command_buffer_cast(base_command_buffer);
 
@@ -408,12 +411,12 @@
   iree_host_size_t binding_base =
       set * IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT;
   for (iree_host_size_t i = 0; i < binding_count; ++i) {
-    if (IREE_UNLIKELY(bindings[i].binding >=
+    if (IREE_UNLIKELY(bindings[i].ordinal >=
                       IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT)) {
       return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                               "buffer binding index out of bounds");
     }
-    iree_host_size_t binding_ordinal = binding_base + bindings[i].binding;
+    iree_host_size_t binding_ordinal = binding_base + bindings[i].ordinal;
 
     // TODO(benvanik): track mapping so we can properly map/unmap/flush/etc.
     iree_hal_buffer_mapping_t buffer_mapping = {{0}};
@@ -556,13 +559,12 @@
 static iree_status_t iree_hal_inline_command_buffer_dispatch_indirect(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_executable_t* executable, int32_t entry_point,
-    iree_hal_buffer_t* workgroups_buffer,
-    iree_device_size_t workgroups_offset) {
+    iree_hal_buffer_ref_t workgroups_ref) {
   // TODO(benvanik): track mapping so we can properly map/unmap/flush/etc.
   iree_hal_buffer_mapping_t buffer_mapping = {{0}};
   IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
-      workgroups_buffer, IREE_HAL_MAPPING_MODE_PERSISTENT,
-      IREE_HAL_MEMORY_ACCESS_READ, workgroups_offset, 3 * sizeof(uint32_t),
+      workgroups_ref.buffer, IREE_HAL_MAPPING_MODE_PERSISTENT,
+      IREE_HAL_MEMORY_ACCESS_READ, workgroups_ref.offset, 3 * sizeof(uint32_t),
       &buffer_mapping));
   iree_hal_vec3_t workgroup_count =
       *(const iree_hal_vec3_t*)buffer_mapping.contents.data;
diff --git a/runtime/src/iree/hal/local/inline_command_buffer.h b/runtime/src/iree/hal/local/inline_command_buffer.h
index d6d0909..b0214ae 100644
--- a/runtime/src/iree/hal/local/inline_command_buffer.h
+++ b/runtime/src/iree/hal/local/inline_command_buffer.h
@@ -17,7 +17,8 @@
 // Returns the size, in bytes, of an inline command buffer.
 // This can be used for arena/stack allocations along with
 // iree_hal_inline_command_buffer_initialize/iree_hal_inline_command_buffer_deinitialize.
-iree_host_size_t iree_hal_inline_command_buffer_size(void);
+iree_host_size_t iree_hal_inline_command_buffer_size(
+    iree_hal_command_buffer_mode_t mode, iree_host_size_t binding_capacity);
 
 // Initializes an inline synchronous one-shot single-threaded command "buffer".
 // This is equivalent to iree_hal_inline_command_buffer_create but uses
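For example, stack-allocating storage with the new sized query (mirroring the
sync device replay path earlier in this change):

  iree_host_size_t storage_size = iree_hal_inline_command_buffer_size(
      IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED, /*binding_capacity=*/0);
  iree_byte_span_t storage =
      iree_make_byte_span(iree_alloca(storage_size), storage_size);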
diff --git a/runtime/src/iree/hal/utils/deferred_command_buffer.c b/runtime/src/iree/hal/utils/deferred_command_buffer.c
index 3b5cc22..485c2e4 100644
--- a/runtime/src/iree/hal/utils/deferred_command_buffer.c
+++ b/runtime/src/iree/hal/utils/deferred_command_buffer.c
@@ -173,12 +173,15 @@
 
   iree_hal_deferred_command_buffer_t* command_buffer = NULL;
   iree_status_t status = iree_allocator_malloc(
-      host_allocator, sizeof(*command_buffer), (void**)&command_buffer);
+      host_allocator,
+      sizeof(*command_buffer) +
+          iree_hal_command_buffer_validation_state_size(mode, binding_capacity),
+      (void**)&command_buffer);
   if (iree_status_is_ok(status)) {
     iree_hal_command_buffer_initialize(
         device, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY,
-        binding_capacity, &iree_hal_deferred_command_buffer_vtable,
-        &command_buffer->base);
+        binding_capacity, (uint8_t*)command_buffer + sizeof(*command_buffer),
+        &iree_hal_deferred_command_buffer_vtable, &command_buffer->base);
     command_buffer->host_allocator = host_allocator;
     iree_hal_cmd_list_initialize(block_pool, &command_buffer->cmd_list);
 
@@ -437,20 +440,23 @@
 
 typedef struct iree_hal_cmd_discard_buffer_t {
   iree_hal_cmd_header_t header;
-  iree_hal_buffer_t* buffer;
+  iree_hal_buffer_ref_t buffer_ref;
 } iree_hal_cmd_discard_buffer_t;
 
 static iree_status_t iree_hal_deferred_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_ref_t buffer_ref) {
   iree_hal_deferred_command_buffer_t* command_buffer =
       iree_hal_deferred_command_buffer_cast(base_command_buffer);
   iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
-  IREE_RETURN_IF_ERROR(
-      iree_hal_resource_set_insert(command_buffer->resource_set, 1, &buffer));
+  if (buffer_ref.buffer) {
+    IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+        command_buffer->resource_set, 1, &buffer_ref.buffer));
+  }
   iree_hal_cmd_discard_buffer_t* cmd = NULL;
   IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
       cmd_list, IREE_HAL_CMD_DISCARD_BUFFER, sizeof(*cmd), (void**)&cmd));
-  cmd->buffer = buffer;
+  cmd->buffer_ref = buffer_ref;
   return iree_ok_status();
 }
 
@@ -459,7 +465,7 @@
     iree_hal_buffer_binding_table_t binding_table,
     const iree_hal_cmd_discard_buffer_t* cmd) {
   return iree_hal_command_buffer_discard_buffer(target_command_buffer,
-                                                cmd->buffer);
+                                                cmd->buffer_ref);
 }
 
 //===----------------------------------------------------------------------===//
@@ -468,17 +474,14 @@
 
 typedef struct iree_hal_cmd_fill_buffer_t {
   iree_hal_cmd_header_t header;
-  iree_hal_buffer_t* target_buffer;
-  iree_device_size_t target_offset;
-  iree_device_size_t length;
+  iree_hal_buffer_ref_t target_ref;
   uint64_t pattern;
   iree_host_size_t pattern_length;
 } iree_hal_cmd_fill_buffer_t;
 
 static iree_status_t iree_hal_deferred_command_buffer_fill_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, const void* pattern,
+    iree_hal_buffer_ref_t target_ref, const void* pattern,
     iree_host_size_t pattern_length) {
   iree_hal_deferred_command_buffer_t* command_buffer =
       iree_hal_deferred_command_buffer_cast(base_command_buffer);
@@ -488,13 +491,13 @@
     return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                             "fill patterns must be < 8 bytes");
   }
-  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
-      command_buffer->resource_set, 1, &target_buffer));
+  if (target_ref.buffer) {
+    IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+        command_buffer->resource_set, 1, &target_ref.buffer));
+  }
   IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
       cmd_list, IREE_HAL_CMD_FILL_BUFFER, sizeof(*cmd), (void**)&cmd));
-  cmd->target_buffer = target_buffer;
-  cmd->target_offset = target_offset;
-  cmd->length = length;
+  cmd->target_ref = target_ref;
   memcpy(&cmd->pattern, pattern, pattern_length);
   cmd->pattern_length = pattern_length;
   return iree_ok_status();
@@ -505,8 +508,8 @@
     iree_hal_buffer_binding_table_t binding_table,
     const iree_hal_cmd_fill_buffer_t* cmd) {
   return iree_hal_command_buffer_fill_buffer(
-      target_command_buffer, cmd->target_buffer, cmd->target_offset,
-      cmd->length, (void**)&cmd->pattern, cmd->pattern_length);
+      target_command_buffer, cmd->target_ref, (void**)&cmd->pattern,
+      cmd->pattern_length);
 }
 
 //===----------------------------------------------------------------------===//
@@ -515,30 +518,28 @@
 
 typedef struct iree_hal_cmd_update_buffer_t {
   iree_hal_cmd_header_t header;
-  iree_hal_buffer_t* target_buffer;
-  iree_device_size_t target_offset;
-  iree_device_size_t length;
+  iree_hal_buffer_ref_t target_ref;
   uint8_t source_buffer[];
 } iree_hal_cmd_update_buffer_t;
 
 static iree_status_t iree_hal_deferred_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
-    iree_device_size_t target_offset, iree_device_size_t length) {
+    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
   iree_hal_deferred_command_buffer_t* command_buffer =
       iree_hal_deferred_command_buffer_cast(base_command_buffer);
   iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
-  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
-      command_buffer->resource_set, 1, &target_buffer));
+  if (target_ref.buffer) {
+    IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+        command_buffer->resource_set, 1, &target_ref.buffer));
+  }
   iree_hal_cmd_update_buffer_t* cmd = NULL;
   IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
       cmd_list, IREE_HAL_CMD_UPDATE_BUFFER,
-      sizeof(*cmd) + sizeof(cmd->source_buffer[0]) * length, (void**)&cmd));
-  cmd->target_buffer = target_buffer;
-  cmd->target_offset = target_offset;
-  cmd->length = length;
+      sizeof(*cmd) + sizeof(cmd->source_buffer[0]) * target_ref.length,
+      (void**)&cmd));
+  cmd->target_ref = target_ref;
   memcpy(cmd->source_buffer, (const uint8_t*)source_buffer + source_offset,
-         sizeof(cmd->source_buffer[0]) * length);
+         sizeof(cmd->source_buffer[0]) * target_ref.length);
   return iree_ok_status();
 }
 
@@ -547,8 +548,7 @@
     iree_hal_buffer_binding_table_t binding_table,
     const iree_hal_cmd_update_buffer_t* cmd) {
   return iree_hal_command_buffer_update_buffer(
-      target_command_buffer, cmd->source_buffer, 0, cmd->target_buffer,
-      cmd->target_offset, cmd->length);
+      target_command_buffer, cmd->source_buffer, 0, cmd->target_ref);
 }
 
 //===----------------------------------------------------------------------===//
@@ -557,32 +557,33 @@
 
 typedef struct iree_hal_cmd_copy_buffer_t {
   iree_hal_cmd_header_t header;
-  iree_hal_buffer_t* source_buffer;
-  iree_device_size_t source_offset;
-  iree_hal_buffer_t* target_buffer;
-  iree_device_size_t target_offset;
-  iree_device_size_t length;
+  iree_hal_buffer_ref_t source_ref;
+  iree_hal_buffer_ref_t target_ref;
 } iree_hal_cmd_copy_buffer_t;
 
 static iree_status_t iree_hal_deferred_command_buffer_copy_buffer(
     iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length) {
+    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
   iree_hal_deferred_command_buffer_t* command_buffer =
       iree_hal_deferred_command_buffer_cast(base_command_buffer);
   iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
-  const void* buffers[2] = {source_buffer, target_buffer};
-  IREE_RETURN_IF_ERROR(
-      iree_hal_resource_set_insert(command_buffer->resource_set, 2, buffers));
+  iree_host_size_t resource_count = 0;
+  const void* resources[2] = {NULL, NULL};
+  if (source_ref.buffer) {
+    resources[resource_count++] = source_ref.buffer;
+  }
+  if (target_ref.buffer) {
+    resources[resource_count++] = target_ref.buffer;
+  }
+  if (resource_count > 0) {
+    IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+        command_buffer->resource_set, resource_count, resources));
+  }
   iree_hal_cmd_copy_buffer_t* cmd = NULL;
   IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
       cmd_list, IREE_HAL_CMD_COPY_BUFFER, sizeof(*cmd), (void**)&cmd));
-  cmd->source_buffer = source_buffer;
-  cmd->source_offset = source_offset;
-  cmd->target_buffer = target_buffer;
-  cmd->target_offset = target_offset;
-  cmd->length = length;
+  cmd->source_ref = source_ref;
+  cmd->target_ref = target_ref;
   return iree_ok_status();
 }
 
@@ -590,9 +591,8 @@
     iree_hal_command_buffer_t* target_command_buffer,
     iree_hal_buffer_binding_table_t binding_table,
     const iree_hal_cmd_copy_buffer_t* cmd) {
-  return iree_hal_command_buffer_copy_buffer(
-      target_command_buffer, cmd->source_buffer, cmd->source_offset,
-      cmd->target_buffer, cmd->target_offset, cmd->length);
+  return iree_hal_command_buffer_copy_buffer(target_command_buffer,
+                                             cmd->source_ref, cmd->target_ref);
 }
 
 //===----------------------------------------------------------------------===//
@@ -604,24 +604,23 @@
   iree_hal_channel_t* channel;
   iree_hal_collective_op_t op;
   uint32_t param;
-  iree_hal_buffer_binding_t send_binding;
-  iree_hal_buffer_binding_t recv_binding;
+  iree_hal_buffer_ref_t send_ref;
+  iree_hal_buffer_ref_t recv_ref;
   iree_device_size_t element_count;
 } iree_hal_cmd_collective_t;
 
 static iree_status_t iree_hal_deferred_command_buffer_collective(
     iree_hal_command_buffer_t* base_command_buffer, iree_hal_channel_t* channel,
-    iree_hal_collective_op_t op, uint32_t param,
-    iree_hal_buffer_binding_t send_binding,
-    iree_hal_buffer_binding_t recv_binding, iree_device_size_t element_count) {
+    iree_hal_collective_op_t op, uint32_t param, iree_hal_buffer_ref_t send_ref,
+    iree_hal_buffer_ref_t recv_ref, iree_device_size_t element_count) {
   iree_hal_deferred_command_buffer_t* command_buffer =
       iree_hal_deferred_command_buffer_cast(base_command_buffer);
   iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
   iree_host_size_t resource_count = 0;
   const void* resources[3] = {NULL, NULL, NULL};
   resources[resource_count++] = channel;
-  if (send_binding.buffer) resources[resource_count++] = send_binding.buffer;
-  if (recv_binding.buffer) resources[resource_count++] = recv_binding.buffer;
+  if (send_ref.buffer) resources[resource_count++] = send_ref.buffer;
+  if (recv_ref.buffer) resources[resource_count++] = recv_ref.buffer;
   IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
       command_buffer->resource_set, resource_count, resources));
   iree_hal_cmd_collective_t* cmd = NULL;
@@ -630,8 +629,8 @@
   cmd->channel = channel;
   cmd->op = op;
   cmd->param = param;
-  cmd->send_binding = send_binding;
-  cmd->recv_binding = recv_binding;
+  cmd->send_ref = send_ref;
+  cmd->recv_ref = recv_ref;
   cmd->element_count = element_count;
   return iree_ok_status();
 }
@@ -640,9 +639,9 @@
     iree_hal_command_buffer_t* target_command_buffer,
     iree_hal_buffer_binding_table_t binding_table,
     const iree_hal_cmd_collective_t* cmd) {
-  return iree_hal_command_buffer_collective(
-      target_command_buffer, cmd->channel, cmd->op, cmd->param,
-      cmd->send_binding, cmd->recv_binding, cmd->element_count);
+  return iree_hal_command_buffer_collective(target_command_buffer, cmd->channel,
+                                            cmd->op, cmd->param, cmd->send_ref,
+                                            cmd->recv_ref, cmd->element_count);
 }
 
 //===----------------------------------------------------------------------===//
@@ -695,14 +694,13 @@
   iree_hal_pipeline_layout_t* pipeline_layout;
   uint32_t set;
   iree_host_size_t binding_count;
-  iree_hal_descriptor_set_binding_t bindings[];
+  iree_hal_buffer_ref_t bindings[];
 } iree_hal_cmd_push_descriptor_set_t;
 
 static iree_status_t iree_hal_deferred_command_buffer_push_descriptor_set(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
-    iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t* bindings) {
+    iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings) {
   iree_hal_deferred_command_buffer_t* command_buffer =
       iree_hal_deferred_command_buffer_cast(base_command_buffer);
   iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
@@ -716,7 +714,7 @@
   cmd->set = set;
   cmd->binding_count = binding_count;
   for (iree_host_size_t i = 0; i < binding_count; ++i) {
-    iree_hal_descriptor_set_binding_t binding = bindings[i];
+    iree_hal_buffer_ref_t binding = bindings[i];
     cmd->bindings[i] = binding;
     if (binding.buffer) {
       IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
@@ -785,28 +783,30 @@
   iree_hal_cmd_header_t header;
   iree_hal_executable_t* executable;
   int32_t entry_point;
-  iree_hal_buffer_t* workgroups_buffer;
-  iree_device_size_t workgroups_offset;
+  iree_hal_buffer_ref_t workgroups_ref;
 } iree_hal_cmd_dispatch_indirect_t;
 
 static iree_status_t iree_hal_deferred_command_buffer_dispatch_indirect(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_executable_t* executable, int32_t entry_point,
-    iree_hal_buffer_t* workgroups_buffer,
-    iree_device_size_t workgroups_offset) {
+    iree_hal_buffer_ref_t workgroups_ref) {
   iree_hal_deferred_command_buffer_t* command_buffer =
       iree_hal_deferred_command_buffer_cast(base_command_buffer);
   iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
-  const void* resources[2] = {executable, workgroups_buffer};
-  IREE_RETURN_IF_ERROR(
-      iree_hal_resource_set_insert(command_buffer->resource_set, 2, resources));
+  iree_host_size_t resource_count = 0;
+  const void* resources[2] = {NULL, NULL};
+  resources[resource_count++] = executable;
+  if (workgroups_ref.buffer) {
+    resources[resource_count++] = workgroups_ref.buffer;
+  }
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, resource_count, resources));
   iree_hal_cmd_dispatch_indirect_t* cmd = NULL;
   IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
       cmd_list, IREE_HAL_CMD_DISPATCH_INDIRECT, sizeof(*cmd), (void**)&cmd));
   cmd->executable = executable;
   cmd->entry_point = entry_point;
-  cmd->workgroups_buffer = workgroups_buffer;
-  cmd->workgroups_offset = workgroups_offset;
+  cmd->workgroups_ref = workgroups_ref;
   return iree_ok_status();
 }
 
@@ -816,7 +816,7 @@
     const iree_hal_cmd_dispatch_indirect_t* cmd) {
   return iree_hal_command_buffer_dispatch_indirect(
       target_command_buffer, cmd->executable, cmd->entry_point,
-      cmd->workgroups_buffer, cmd->workgroups_offset);
+      cmd->workgroups_ref);
 }
 
 //===----------------------------------------------------------------------===//
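
Note the recurring `if (ref.buffer)` guard before resource-set insertion
above: a ref with a NULL buffer pointer is now legal and is the shape an
indirect, binding-table-backed reference takes, so there is nothing to retain
at record time. A hedged sketch of such a ref (slot value illustrative; the
replay paths above already receive the iree_hal_buffer_binding_table_t such a
slot would resolve against):

  // Indirect form: no concrete buffer at record time, nothing to retain.
  iree_hal_buffer_ref_t slot_ref = {0};
  slot_ref.buffer_slot = 3;             // binding table slot to resolve later
  slot_ref.offset = 0;                  // byte range within the bound buffer
  slot_ref.length = IREE_WHOLE_BUFFER;  // or an explicit byte length
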
diff --git a/runtime/src/iree/io/parameter_index_provider.c b/runtime/src/iree/io/parameter_index_provider.c
index c144bfd..ff8ac7d 100644
--- a/runtime/src/iree/io/parameter_index_provider.c
+++ b/runtime/src/iree/io/parameter_index_provider.c
@@ -508,9 +508,10 @@
   // Parameter ranges cannot overlap so there's no barrier required.
   batch->transfer_bytes_outstanding += length;
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_hal_command_buffer_fill_buffer(batch->transfer_command_buffer,
-                                              buffer, buffer_offset, length,
-                                              pattern, pattern_length));
+      z0, iree_hal_command_buffer_fill_buffer(
+              batch->transfer_command_buffer,
+              iree_hal_make_buffer_ref(buffer, buffer_offset, length), pattern,
+              pattern_length));
 
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
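
The call-site migration is mechanical and repeats through the remainder of
the diff: every loose (buffer, offset, length) triple becomes a single
iree_hal_make_buffer_ref(...) argument. A before/after sketch with
illustrative names:

  // Before: iree_hal_command_buffer_fill_buffer(cb, buffer, offset, length,
  //                                             &pattern, sizeof(pattern));
  // After: the same triple travels as one ref.
  uint32_t pattern = 0;  // 4-byte fill pattern
  iree_hal_command_buffer_fill_buffer(
      cb, iree_hal_make_buffer_ref(buffer, offset, length), &pattern,
      sizeof(pattern));
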
diff --git a/runtime/src/iree/modules/check/module.cc b/runtime/src/iree/modules/check/module.cc
index 039d2d5..7159e57 100644
--- a/runtime/src/iree/modules/check/module.cc
+++ b/runtime/src/iree/modules/check/module.cc
@@ -205,8 +205,9 @@
         iree_hal_device_allocator(device), target_params, buffer_length,
         &target_buffer));
     IREE_RETURN_IF_ERROR(iree_hal_command_buffer_copy_buffer(
-        command_buffer.get(), source_buffer, 0, target_buffer.get(), 0,
-        buffer_length));
+        command_buffer.get(),
+        iree_hal_make_buffer_ref(source_buffer, 0, buffer_length),
+        iree_hal_make_buffer_ref(target_buffer.get(), 0, buffer_length)));
     vm::ref<iree_hal_buffer_view_t> target_view;
     IREE_RETURN_IF_ERROR(iree_hal_buffer_view_create_like(
         target_buffer.get(), source_views[i].get(),
diff --git a/runtime/src/iree/modules/hal/module.c b/runtime/src/iree/modules/hal/module.c
index 790c616..1b9f8df 100644
--- a/runtime/src/iree/modules/hal/module.c
+++ b/runtime/src/iree/modules/hal/module.c
@@ -768,9 +768,10 @@
   uint32_t pattern = (uint32_t)args->i4;
   uint32_t pattern_length = (uint32_t)args->i5;
 
-  return iree_hal_command_buffer_fill_buffer(command_buffer, target_buffer,
-                                             target_offset, length, &pattern,
-                                             pattern_length);
+  iree_hal_buffer_ref_t target_ref =
+      iree_hal_make_buffer_ref(target_buffer, target_offset, length);
+  return iree_hal_command_buffer_fill_buffer(command_buffer, target_ref,
+                                             &pattern, pattern_length);
 }
 
 IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_copy_buffer,  //
@@ -787,9 +788,12 @@
   iree_device_size_t target_offset = iree_hal_cast_device_size(args->i4);
   iree_device_size_t length = iree_hal_cast_device_size(args->i5);
 
-  return iree_hal_command_buffer_copy_buffer(command_buffer, source_buffer,
-                                             source_offset, target_buffer,
-                                             target_offset, length);
+  iree_hal_buffer_ref_t source_ref =
+      iree_hal_make_buffer_ref(source_buffer, source_offset, length);
+  iree_hal_buffer_ref_t target_ref =
+      iree_hal_make_buffer_ref(target_buffer, target_offset, length);
+  return iree_hal_command_buffer_copy_buffer(command_buffer, source_ref,
+                                             target_ref);
 }
 
 IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_collective,  //
@@ -802,25 +806,20 @@
   IREE_RETURN_IF_ERROR(iree_hal_channel_check_deref(args->r1, &channel));
   iree_hal_collective_op_t op = {.packed = args->i2};
   uint32_t param = args->i3;
-  iree_hal_buffer_binding_t send_binding = {
-      .buffer = NULL,
-      .offset = iree_hal_cast_device_size(args->i5),
-      .length = iree_hal_cast_device_size(args->i6),
-  };
+  iree_hal_buffer_ref_t send_ref =
+      iree_hal_make_buffer_ref(NULL, iree_hal_cast_device_size(args->i5),
+                               iree_hal_cast_device_size(args->i6));
   IREE_RETURN_IF_ERROR(
-      iree_hal_buffer_check_deref_or_null(args->r4, &send_binding.buffer));
-  iree_hal_buffer_binding_t recv_binding = {
-      .buffer = NULL,
-      .offset = iree_hal_cast_device_size(args->i8),
-      .length = iree_hal_cast_device_size(args->i9),
-  };
+      iree_hal_buffer_check_deref_or_null(args->r4, &send_ref.buffer));
+  iree_hal_buffer_ref_t recv_ref =
+      iree_hal_make_buffer_ref(NULL, iree_hal_cast_device_size(args->i8),
+                               iree_hal_cast_device_size(args->i9));
   IREE_RETURN_IF_ERROR(
-      iree_hal_buffer_check_deref_or_null(args->r7, &recv_binding.buffer));
+      iree_hal_buffer_check_deref_or_null(args->r7, &recv_ref.buffer));
   iree_device_size_t element_count = iree_hal_cast_device_size(args->i10);
 
   return iree_hal_command_buffer_collective(command_buffer, channel, op, param,
-                                            send_binding, recv_binding,
-                                            element_count);
+                                            send_ref, recv_ref, element_count);
 }
 
 IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_push_constants,  //
@@ -859,11 +858,10 @@
         IREE_STATUS_OUT_OF_RANGE, "binding count %" PRIhsz " > %" PRIhsz,
         binding_count, IREE_HAL_MODULE_MAX_DESCRIPTOR_BINDING_COUNT);
   }
-  iree_hal_descriptor_set_binding_t* bindings =
-      (iree_hal_descriptor_set_binding_t*)iree_alloca(
-          binding_count * sizeof(iree_hal_descriptor_set_binding_t));
+  iree_hal_buffer_ref_t* bindings = (iree_hal_buffer_ref_t*)iree_alloca(
+      binding_count * sizeof(iree_hal_buffer_ref_t));
   for (iree_host_size_t i = 0; i < binding_count; ++i) {
-    bindings[i].binding = (uint32_t)args->a3[i].i0;
+    bindings[i].ordinal = (uint32_t)args->a3[i].i0;
     bindings[i].buffer_slot = (uint32_t)args->a3[i].i1;
     IREE_RETURN_IF_ERROR(iree_hal_buffer_check_deref_or_null(
         args->a3[i].r2, &bindings[i].buffer));
@@ -907,9 +905,10 @@
       iree_hal_buffer_check_deref(args->r3, &workgroups_buffer));
   iree_device_size_t workgroups_offset = iree_hal_cast_device_size(args->i4);
 
-  return iree_hal_command_buffer_dispatch_indirect(
-      command_buffer, executable, entry_point, workgroups_buffer,
-      workgroups_offset);
+  iree_hal_buffer_ref_t workgroups_ref = iree_hal_make_buffer_ref(
+      workgroups_buffer, workgroups_offset, 3 * sizeof(uint32_t));
+  return iree_hal_command_buffer_dispatch_indirect(command_buffer, executable,
+                                                   entry_point, workgroups_ref);
 }
 
 //===----------------------------------------------------------------------===//
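
Descriptor bindings assembled by the VM module now use the same ref type,
with `ordinal` naming the binding within the set and `buffer_slot` reserved
for the indirect case. A hedged sketch, assuming a pipeline_layout, set 0,
and one concrete plus one table-backed binding (all names illustrative):

  iree_hal_buffer_ref_t bindings[2];
  // Binding 0: concrete buffer available at record time.
  bindings[0] = iree_hal_make_buffer_ref(storage_buffer, 0, IREE_WHOLE_BUFFER);
  bindings[0].ordinal = 0;
  // Binding 1: buffer left NULL so it resolves through the binding table.
  bindings[1] = iree_hal_make_buffer_ref(NULL, 0, IREE_WHOLE_BUFFER);
  bindings[1].ordinal = 1;
  bindings[1].buffer_slot = 0;  // illustrative table slot
  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_push_descriptor_set(
      command_buffer, pipeline_layout, /*set=*/0, IREE_ARRAYSIZE(bindings),
      bindings));
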
diff --git a/runtime/src/iree/tooling/function_util.c b/runtime/src/iree/tooling/function_util.c
index aa06445..6d63c8d 100644
--- a/runtime/src/iree/tooling/function_util.c
+++ b/runtime/src/iree/tooling/function_util.c
@@ -82,8 +82,11 @@
               iree_hal_buffer_allocation_size(source_buffer), &target_buffer));
 
   iree_status_t status = iree_hal_command_buffer_copy_buffer(
-      command_buffer, source_buffer, 0, target_buffer, 0,
-      iree_hal_buffer_byte_length(source_buffer));
+      command_buffer,
+      iree_hal_make_buffer_ref(source_buffer, 0,
+                               iree_hal_buffer_byte_length(source_buffer)),
+      iree_hal_make_buffer_ref(target_buffer, 0,
+                               iree_hal_buffer_byte_length(source_buffer)));
 
   if (iree_status_is_ok(status)) {
     *out_target_buffer = target_buffer;
diff --git a/tools/iree-benchmark-executable-main.c b/tools/iree-benchmark-executable-main.c
index 2fd1066..f3a0b14 100644
--- a/tools/iree-benchmark-executable-main.c
+++ b/tools/iree-benchmark-executable-main.c
@@ -206,7 +206,7 @@
   iree_hal_device_t* device;
   iree_hal_executable_t* executable;
   iree_hal_pipeline_layout_t* pipeline_layout;
-  const iree_hal_descriptor_set_binding_t* bindings;
+  const iree_hal_buffer_ref_t* bindings;
   uint32_t workgroup_count[3];
 } iree_benchmark_executable_args_t;
 
@@ -388,7 +388,7 @@
       (iree_string_view_list_t){parsed_params.binding_count,
                                 parsed_params.binding_specs},
       device, device_allocator, host_allocator, &binding_list));
-  iree_hal_descriptor_set_binding_t bindings[IREE_HAL_MAX_TOTAL_BINDING_COUNT];
+  iree_hal_buffer_ref_t bindings[IREE_HAL_MAX_TOTAL_BINDING_COUNT];
   for (iree_host_size_t i = 0; i < parsed_params.binding_count; ++i) {
     iree_vm_ref_t value = iree_vm_ref_null();
     IREE_RETURN_IF_ERROR(iree_vm_list_get_ref_assign(binding_list, i, &value));
@@ -404,13 +404,8 @@
           " is not",
           i);
     }
-    bindings[i] = (iree_hal_descriptor_set_binding_t){
-        .binding = i,
-        .buffer_slot = 0,
-        .buffer = buffer,
-        .offset = 0,
-        .length = IREE_WHOLE_BUFFER,
-    };
+    bindings[i] = iree_hal_make_buffer_ref(buffer, 0, IREE_WHOLE_BUFFER);
+    bindings[i].ordinal = i;
   }
 
   // Setup the specification used to perform the executable load.