Adding iree_hal_command_buffer_begin/end_debug_group. (#6474)

This will let us insert debug markers and trace events around arbitrary
sets of commands. Currently only Vulkan plumbs these through.
diff --git a/experimental/rocm/direct_command_buffer.c b/experimental/rocm/direct_command_buffer.c
index d75c060..3d8706a 100644
--- a/experimental/rocm/direct_command_buffer.c
+++ b/experimental/rocm/direct_command_buffer.c
@@ -23,13 +23,13 @@
 
 typedef struct {
   iree_hal_resource_t resource;
-  iree_hal_rocm_context_wrapper_t *context;
+  iree_hal_rocm_context_wrapper_t* context;
   iree_hal_command_buffer_mode_t mode;
   iree_hal_command_category_t allowed_categories;
   iree_hal_queue_affinity_t queue_affinity;
   size_t total_size;
   // Keep track of the current set of kernel arguments.
-  void *current_descriptor[];
+  void* current_descriptor[];
 } iree_hal_rocm_direct_command_buffer_t;
 
 #define IREE_HAL_ROCM_MAX_BINDING_COUNT 64
@@ -37,29 +37,29 @@
 extern const iree_hal_command_buffer_vtable_t
     iree_hal_rocm_direct_command_buffer_vtable;
 
-static iree_hal_rocm_direct_command_buffer_t *
+static iree_hal_rocm_direct_command_buffer_t*
 iree_hal_rocm_direct_command_buffer_cast(
-    iree_hal_command_buffer_t *base_value) {
+    iree_hal_command_buffer_t* base_value) {
   IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_rocm_direct_command_buffer_vtable);
-  return (iree_hal_rocm_direct_command_buffer_t *)base_value;
+  return (iree_hal_rocm_direct_command_buffer_t*)base_value;
 }
 
 iree_status_t iree_hal_rocm_direct_command_buffer_allocate(
-    iree_hal_rocm_context_wrapper_t *context,
+    iree_hal_rocm_context_wrapper_t* context,
     iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_hal_queue_affinity_t queue_affinity,
-    iree_hal_command_buffer_t **out_command_buffer) {
+    iree_hal_command_buffer_t** out_command_buffer) {
   IREE_ASSERT_ARGUMENT(context);
   IREE_ASSERT_ARGUMENT(out_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
 
-  iree_hal_rocm_direct_command_buffer_t *command_buffer = NULL;
+  iree_hal_rocm_direct_command_buffer_t* command_buffer = NULL;
   size_t total_size = sizeof(*command_buffer) +
-                      IREE_HAL_ROCM_MAX_BINDING_COUNT * sizeof(void *) +
+                      IREE_HAL_ROCM_MAX_BINDING_COUNT * sizeof(void*) +
                       IREE_HAL_ROCM_MAX_BINDING_COUNT * sizeof(hipDeviceptr_t);
   iree_status_t status = iree_allocator_malloc(
-      context->host_allocator, total_size, (void **)&command_buffer);
+      context->host_allocator, total_size, (void**)&command_buffer);
   if (iree_status_is_ok(status)) {
     iree_hal_resource_initialize(&iree_hal_rocm_direct_command_buffer_vtable,
                                  &command_buffer->resource);
@@ -67,15 +67,15 @@
     command_buffer->mode = mode;
     command_buffer->allowed_categories = command_categories;
     command_buffer->queue_affinity = queue_affinity;
-    hipDeviceptr_t *device_ptrs =
-        (hipDeviceptr_t *)(command_buffer->current_descriptor +
-                           IREE_HAL_ROCM_MAX_BINDING_COUNT);
+    hipDeviceptr_t* device_ptrs =
+        (hipDeviceptr_t*)(command_buffer->current_descriptor +
+                          IREE_HAL_ROCM_MAX_BINDING_COUNT);
     for (size_t i = 0; i < IREE_HAL_ROCM_MAX_BINDING_COUNT; i++) {
       command_buffer->current_descriptor[i] = &device_ptrs[i];
     }
     command_buffer->total_size = total_size;
 
-    *out_command_buffer = (iree_hal_command_buffer_t *)command_buffer;
+    *out_command_buffer = (iree_hal_command_buffer_t*)command_buffer;
   }
 
   IREE_TRACE_ZONE_END(z0);
@@ -83,8 +83,8 @@
 }
 
 static void iree_hal_rocm_direct_command_buffer_destroy(
-    iree_hal_command_buffer_t *base_command_buffer) {
-  iree_hal_rocm_direct_command_buffer_t *command_buffer =
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_rocm_direct_command_buffer_t* command_buffer =
       iree_hal_rocm_direct_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
 
@@ -94,91 +94,103 @@
 }
 
 static iree_hal_command_buffer_mode_t iree_hal_rocm_direct_command_buffer_mode(
-    const iree_hal_command_buffer_t *base_command_buffer) {
-  const iree_hal_rocm_direct_command_buffer_t *command_buffer =
-      (const iree_hal_rocm_direct_command_buffer_t *)(base_command_buffer);
+    const iree_hal_command_buffer_t* base_command_buffer) {
+  const iree_hal_rocm_direct_command_buffer_t* command_buffer =
+      (const iree_hal_rocm_direct_command_buffer_t*)(base_command_buffer);
   return command_buffer->mode;
 }
 
 static iree_hal_command_category_t
 iree_hal_rocm_direct_command_buffer_allowed_categories(
-    const iree_hal_command_buffer_t *base_command_buffer) {
-  const iree_hal_rocm_direct_command_buffer_t *command_buffer =
-      (const iree_hal_rocm_direct_command_buffer_t *)(base_command_buffer);
+    const iree_hal_command_buffer_t* base_command_buffer) {
+  const iree_hal_rocm_direct_command_buffer_t* command_buffer =
+      (const iree_hal_rocm_direct_command_buffer_t*)(base_command_buffer);
   return command_buffer->allowed_categories;
 }
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_begin(
-    iree_hal_command_buffer_t *base_command_buffer) {
+    iree_hal_command_buffer_t* base_command_buffer) {
   return iree_ok_status();
 }
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_end(
-    iree_hal_command_buffer_t *base_command_buffer) {
+    iree_hal_command_buffer_t* base_command_buffer) {
   return iree_ok_status();
 }
 
+static void iree_hal_rocm_direct_command_buffer_begin_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer, iree_string_view_t label,
+    iree_hal_label_color_t label_color,
+    const iree_hal_label_location_t* location) {
+  // TODO(benvanik): tracy event stack.
+}
+
+static void iree_hal_rocm_direct_command_buffer_end_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  // TODO(benvanik): tracy event stack.
+}
+
 static iree_status_t iree_hal_rocm_direct_command_buffer_execution_barrier(
-    iree_hal_command_buffer_t *base_command_buffer,
+    iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_execution_stage_t source_stage_mask,
     iree_hal_execution_stage_t target_stage_mask,
     iree_hal_execution_barrier_flags_t flags,
     iree_host_size_t memory_barrier_count,
-    const iree_hal_memory_barrier_t *memory_barriers,
+    const iree_hal_memory_barrier_t* memory_barriers,
     iree_host_size_t buffer_barrier_count,
-    const iree_hal_buffer_barrier_t *buffer_barriers) {
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
   // TODO: Implement barrier
   return iree_ok_status();
 }
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_signal_event(
-    iree_hal_command_buffer_t *base_command_buffer, iree_hal_event_t *event,
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
     iree_hal_execution_stage_t source_stage_mask) {
   // TODO: Implement barrier
   return iree_ok_status();
 }
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_reset_event(
-    iree_hal_command_buffer_t *base_command_buffer, iree_hal_event_t *event,
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
     iree_hal_execution_stage_t source_stage_mask) {
   // TODO: Implement barrier
   return iree_ok_status();
 }
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_wait_events(
-    iree_hal_command_buffer_t *base_command_buffer,
-    iree_host_size_t event_count, const iree_hal_event_t **events,
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_host_size_t event_count, const iree_hal_event_t** events,
     iree_hal_execution_stage_t source_stage_mask,
     iree_hal_execution_stage_t target_stage_mask,
     iree_host_size_t memory_barrier_count,
-    const iree_hal_memory_barrier_t *memory_barriers,
+    const iree_hal_memory_barrier_t* memory_barriers,
     iree_host_size_t buffer_barrier_count,
-    const iree_hal_buffer_barrier_t *buffer_barriers) {
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
   // TODO: Implement barrier
   return iree_ok_status();
 }
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_discard_buffer(
-    iree_hal_command_buffer_t *base_command_buffer, iree_hal_buffer_t *buffer) {
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
   // nothing to do.
   return iree_ok_status();
 }
 
 // Splats a pattern value of 1, 2, or 4 bytes out to a 4 byte value.
-static uint32_t iree_hal_rocm_splat_pattern(const void *pattern,
+static uint32_t iree_hal_rocm_splat_pattern(const void* pattern,
                                             size_t pattern_length) {
   switch (pattern_length) {
     case 1: {
-      uint32_t pattern_value = *(const uint8_t *)(pattern);
+      uint32_t pattern_value = *(const uint8_t*)(pattern);
       return (pattern_value << 24) | (pattern_value << 16) |
              (pattern_value << 8) | pattern_value;
     }
     case 2: {
-      uint32_t pattern_value = *(const uint16_t *)(pattern);
+      uint32_t pattern_value = *(const uint16_t*)(pattern);
       return (pattern_value << 16) | pattern_value;
     }
     case 4: {
-      uint32_t pattern_value = *(const uint32_t *)(pattern);
+      uint32_t pattern_value = *(const uint32_t*)(pattern);
       return pattern_value;
     }
     default:
@@ -187,11 +199,11 @@
 }
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_fill_buffer(
-    iree_hal_command_buffer_t *base_command_buffer,
-    iree_hal_buffer_t *target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, const void *pattern,
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, const void* pattern,
     iree_host_size_t pattern_length) {
-  iree_hal_rocm_direct_command_buffer_t *command_buffer =
+  iree_hal_rocm_direct_command_buffer_t* command_buffer =
       iree_hal_rocm_direct_command_buffer_cast(base_command_buffer);
 
   hipDeviceptr_t target_device_buffer = iree_hal_rocm_buffer_device_pointer(
@@ -210,19 +222,19 @@
 }
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_update_buffer(
-    iree_hal_command_buffer_t *base_command_buffer, const void *source_buffer,
-    iree_host_size_t source_offset, iree_hal_buffer_t *target_buffer,
+    iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
+    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
     iree_device_size_t target_offset, iree_device_size_t length) {
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                           "need rocm implementation");
 }
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_copy_buffer(
-    iree_hal_command_buffer_t *base_command_buffer,
-    iree_hal_buffer_t *source_buffer, iree_device_size_t source_offset,
-    iree_hal_buffer_t *target_buffer, iree_device_size_t target_offset,
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
     iree_device_size_t length) {
-  iree_hal_rocm_direct_command_buffer_t *command_buffer =
+  iree_hal_rocm_direct_command_buffer_t* command_buffer =
       iree_hal_rocm_direct_command_buffer_cast(base_command_buffer);
 
   hipDeviceptr_t target_device_buffer = iree_hal_rocm_buffer_device_pointer(
@@ -242,9 +254,9 @@
 }
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_push_constants(
-    iree_hal_command_buffer_t *base_command_buffer,
-    iree_hal_executable_layout_t *executable_layout, iree_host_size_t offset,
-    const void *values, iree_host_size_t values_length) {
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, iree_host_size_t offset,
+    const void* values, iree_host_size_t values_length) {
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                           "need rocm implementation");
 }
@@ -256,20 +268,20 @@
 } iree_hal_rocm_binding_mapping_t;
 
 // Helper to sort the binding based on their binding index.
-static int compare_binding_index(const void *a, const void *b) {
+static int compare_binding_index(const void* a, const void* b) {
   const iree_hal_rocm_binding_mapping_t buffer_a =
-      *(const iree_hal_rocm_binding_mapping_t *)a;
+      *(const iree_hal_rocm_binding_mapping_t*)a;
   const iree_hal_rocm_binding_mapping_t buffer_b =
-      *(const iree_hal_rocm_binding_mapping_t *)b;
+      *(const iree_hal_rocm_binding_mapping_t*)b;
   return buffer_a.binding < buffer_b.binding ? -1 : 1;
 }
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_push_descriptor_set(
-    iree_hal_command_buffer_t *base_command_buffer,
-    iree_hal_executable_layout_t *executable_layout, uint32_t set,
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
     iree_host_size_t binding_count,
-    const iree_hal_descriptor_set_binding_t *bindings) {
-  iree_hal_rocm_direct_command_buffer_t *command_buffer =
+    const iree_hal_descriptor_set_binding_t* bindings) {
+  iree_hal_rocm_direct_command_buffer_t* command_buffer =
       iree_hal_rocm_direct_command_buffer_cast(base_command_buffer);
   // Convention with the compiler side. We map bindings to kernel argument.
   // We compact the bindings to get a dense set of arguments and keep them order
@@ -291,26 +303,26 @@
         iree_hal_rocm_buffer_device_pointer(
             iree_hal_buffer_allocated_buffer(binding.buffer)) +
         iree_hal_buffer_byte_offset(binding.buffer) + binding.offset;
-    *((hipDeviceptr_t *)command_buffer->current_descriptor[i]) = device_ptr;
+    *((hipDeviceptr_t*)command_buffer->current_descriptor[i]) = device_ptr;
   }
   return iree_ok_status();
 }
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_bind_descriptor_set(
-    iree_hal_command_buffer_t *base_command_buffer,
-    iree_hal_executable_layout_t *executable_layout, uint32_t set,
-    iree_hal_descriptor_set_t *descriptor_set,
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_hal_descriptor_set_t* descriptor_set,
     iree_host_size_t dynamic_offset_count,
-    const iree_device_size_t *dynamic_offsets) {
+    const iree_device_size_t* dynamic_offsets) {
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                           "need rocm implementation");
 }
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_dispatch(
-    iree_hal_command_buffer_t *base_command_buffer,
-    iree_hal_executable_t *executable, int32_t entry_point,
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
     uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z) {
-  iree_hal_rocm_direct_command_buffer_t *command_buffer =
+  iree_hal_rocm_direct_command_buffer_t* command_buffer =
       iree_hal_rocm_direct_command_buffer_cast(base_command_buffer);
   iree_hal_rocm_direct_command_buffer_cast(base_command_buffer);
 
@@ -332,9 +344,9 @@
 }
 
 static iree_status_t iree_hal_rocm_direct_command_buffer_dispatch_indirect(
-    iree_hal_command_buffer_t *base_command_buffer,
-    iree_hal_executable_t *executable, int32_t entry_point,
-    iree_hal_buffer_t *workgroups_buffer,
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    iree_hal_buffer_t* workgroups_buffer,
     iree_device_size_t workgroups_offset) {
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                           "need rocm implementation");
@@ -348,6 +360,9 @@
             iree_hal_rocm_direct_command_buffer_allowed_categories,
         .begin = iree_hal_rocm_direct_command_buffer_begin,
         .end = iree_hal_rocm_direct_command_buffer_end,
+        .begin_debug_group =
+            iree_hal_rocm_direct_command_buffer_begin_debug_group,
+        .end_debug_group = iree_hal_rocm_direct_command_buffer_end_debug_group,
         .execution_barrier =
             iree_hal_rocm_direct_command_buffer_execution_barrier,
         .signal_event = iree_hal_rocm_direct_command_buffer_signal_event,
diff --git a/iree/compiler/Dialect/HAL/Conversion/HALToVM/ConvertCommandBufferOps.cpp b/iree/compiler/Dialect/HAL/Conversion/HALToVM/ConvertCommandBufferOps.cpp
index 3ff4c80..0ac4bc9 100644
--- a/iree/compiler/Dialect/HAL/Conversion/HALToVM/ConvertCommandBufferOps.cpp
+++ b/iree/compiler/Dialect/HAL/Conversion/HALToVM/ConvertCommandBufferOps.cpp
@@ -74,6 +74,14 @@
   patterns.insert<VMImportOpConversion<IREE::HAL::CommandBufferEndOp>>(
       context, importSymbols, typeConverter, "hal.command_buffer.end");
   patterns
+      .insert<VMImportOpConversion<IREE::HAL::CommandBufferBeginDebugGroupOp>>(
+          context, importSymbols, typeConverter,
+          "hal.command_buffer.begin_debug_group");
+  patterns
+      .insert<VMImportOpConversion<IREE::HAL::CommandBufferEndDebugGroupOp>>(
+          context, importSymbols, typeConverter,
+          "hal.command_buffer.end_debug_group");
+  patterns
       .insert<VMImportOpConversion<IREE::HAL::CommandBufferExecutionBarrierOp>>(
           context, importSymbols, typeConverter,
           "hal.command_buffer.execution_barrier");
diff --git a/iree/compiler/Dialect/HAL/IR/HALOps.td b/iree/compiler/Dialect/HAL/IR/HALOps.td
index 158182a..cc68781 100644
--- a/iree/compiler/Dialect/HAL/IR/HALOps.td
+++ b/iree/compiler/Dialect/HAL/IR/HALOps.td
@@ -1125,6 +1125,43 @@
   let hasCanonicalizer = 1;
 }
 
+def HAL_CommandBufferBeginDebugGroupOp : HAL_Op<"command_buffer.begin_debug_group"> {
+  let summary = [{pushes a command buffer debug group label}];
+  let description = [{
+    Pushes a new debug group with the given label.
+    All commands between this and a mandatory matching call to
+    `hal.command_buffer.end_debug_group` will be grouped together with the
+    given label.
+  }];
+
+  let arguments = (ins
+    HAL_CommandBuffer:$command_buffer,
+    StrAttr:$label
+  );
+
+  let assemblyFormat = [{
+    `<` $command_buffer `:` type($command_buffer) `>`
+    `label` `(` $label `)`
+    attr-dict-with-keyword
+  }];
+}
+
+def HAL_CommandBufferEndDebugGroupOp : HAL_Op<"command_buffer.end_debug_group"> {
+  let summary = [{pops a command buffer debug group label}];
+  let description = [{
+    Pops a debug group from the stack.
+  }];
+
+  let arguments = (ins
+    HAL_CommandBuffer:$command_buffer
+  );
+
+  let assemblyFormat = [{
+    `<` $command_buffer `:` type($command_buffer) `>`
+    attr-dict-with-keyword
+  }];
+}
+
 def HAL_CommandBufferExecutionBarrierOp : HAL_Op<"command_buffer.execution_barrier"> {
   let summary = [{command buffer execution barrier recording operation}];
   let description = [{
diff --git a/iree/compiler/Dialect/HAL/hal.imports.mlir b/iree/compiler/Dialect/HAL/hal.imports.mlir
index 2470eab..c51db3f 100644
--- a/iree/compiler/Dialect/HAL/hal.imports.mlir
+++ b/iree/compiler/Dialect/HAL/hal.imports.mlir
@@ -141,6 +141,17 @@
   %command_buffer : !vm.ref<!hal.command_buffer>
 )
 
+// Pushes a new debug group with the given |label|.
+vm.import @command_buffer.begin_debug_group(
+  %command_buffer : !vm.ref<!hal.command_buffer>,
+  %label : !vm.buffer
+)
+
+// Pops a debug group from the stack.
+vm.import @command_buffer.end_debug_group(
+  %command_buffer : !vm.ref<!hal.command_buffer>
+)
+
 // Defines an execution dependency between all commands recorded before the
 // barrier and all commands recorded after the barrier. Only the stages provided
 // will be affected.
diff --git a/iree/hal/command_buffer.c b/iree/hal/command_buffer.c
index 99b4758..c416684 100644
--- a/iree/hal/command_buffer.c
+++ b/iree/hal/command_buffer.c
@@ -79,6 +79,22 @@
   return status;
 }
 
+IREE_API_EXPORT void iree_hal_command_buffer_begin_debug_group(
+    iree_hal_command_buffer_t* command_buffer, iree_string_view_t label,
+    iree_hal_label_color_t label_color,
+    const iree_hal_label_location_t* location) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  _VTABLE_DISPATCH(command_buffer, begin_debug_group)
+  (command_buffer, label, label_color, location);
+}
+
+IREE_API_EXPORT void iree_hal_command_buffer_end_debug_group(
+    iree_hal_command_buffer_t* command_buffer) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  _VTABLE_DISPATCH(command_buffer, end_debug_group)
+  (command_buffer);
+}
+
 IREE_API_EXPORT iree_status_t iree_hal_command_buffer_execution_barrier(
     iree_hal_command_buffer_t* command_buffer,
     iree_hal_execution_stage_t source_stage_mask,
diff --git a/iree/hal/command_buffer.h b/iree/hal/command_buffer.h
index 38d7242..82b3deb 100644
--- a/iree/hal/command_buffer.h
+++ b/iree/hal/command_buffer.h
@@ -182,6 +182,26 @@
   iree_device_size_t length;
 } iree_hal_buffer_barrier_t;
 
+// An RGBA color.
+typedef struct iree_hal_label_color_t {
+  uint8_t r;
+  uint8_t g;
+  uint8_t b;
+  uint8_t a;
+} iree_hal_label_color_t;
+
+// A source location attached to debug labels.
+typedef struct iree_hal_label_location_t {
+  iree_string_view_t file;
+  int line;
+} iree_hal_label_location_t;
+
+// An unspecified color; debugging tools are to choose their own.
+static inline iree_hal_label_color_t iree_hal_label_color_unspecified(void) {
+  iree_hal_label_color_t color = {0, 0, 0, 0};
+  return color;
+}
+
 // TODO(benvanik): replace with tables for iree_string_builder_*.
 #define iree_hal_command_buffer_mode_string(...) "TODO"
 //    {IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT, "ONE_SHOT"},
@@ -262,6 +282,25 @@
 IREE_API_EXPORT iree_status_t
 iree_hal_command_buffer_end(iree_hal_command_buffer_t* command_buffer);
 
+// Pushes a new debug group with the given |label|.
+// All commands between this and a mandatory matching call to
+// iree_hal_command_buffer_end_debug_group will be grouped together with the
+// given label. If a source location is available it can be provided via
+// |location| to allow mapping back into the source program that issued the
+// commands.
+//
+// An optional RGBA color to show in the debug UI may be provided via
+// |label_color|; otherwise iree_hal_label_color_unspecified can be used to let
+// the debug tool choose.
+IREE_API_EXPORT void iree_hal_command_buffer_begin_debug_group(
+    iree_hal_command_buffer_t* command_buffer, iree_string_view_t label,
+    iree_hal_label_color_t label_color,
+    const iree_hal_label_location_t* location);
+
+// Pops a debug group from the stack.
+IREE_API_EXPORT void iree_hal_command_buffer_end_debug_group(
+    iree_hal_command_buffer_t* command_buffer);
+
 // Defines a memory dependency between commands recorded before and after the
 // barrier. One or more memory or buffer barriers can be specified to indicate
 // between which stages or buffers the dependencies exist.
@@ -468,6 +507,13 @@
   iree_status_t(IREE_API_PTR* begin)(iree_hal_command_buffer_t* command_buffer);
   iree_status_t(IREE_API_PTR* end)(iree_hal_command_buffer_t* command_buffer);
 
+  void(IREE_API_PTR* begin_debug_group)(
+      iree_hal_command_buffer_t* command_buffer, iree_string_view_t label,
+      iree_hal_label_color_t label_color,
+      const iree_hal_label_location_t* location);
+  void(IREE_API_PTR* end_debug_group)(
+      iree_hal_command_buffer_t* command_buffer);
+
   iree_status_t(IREE_API_PTR* execution_barrier)(
       iree_hal_command_buffer_t* command_buffer,
       iree_hal_execution_stage_t source_stage_mask,
diff --git a/iree/hal/cuda/graph_command_buffer.c b/iree/hal/cuda/graph_command_buffer.c
index 9b7fdc9..1b38e9d 100644
--- a/iree/hal/cuda/graph_command_buffer.c
+++ b/iree/hal/cuda/graph_command_buffer.c
@@ -164,6 +164,18 @@
   return iree_ok_status();
 }
 
+static void iree_hal_cuda_graph_command_buffer_begin_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer, iree_string_view_t label,
+    iree_hal_label_color_t label_color,
+    const iree_hal_label_location_t* location) {
+  // TODO(benvanik): tracy event stack.
+}
+
+static void iree_hal_cuda_graph_command_buffer_end_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  // TODO(benvanik): tracy event stack.
+}
+
 static iree_status_t iree_hal_cuda_graph_command_buffer_execution_barrier(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_execution_stage_t source_stage_mask,
@@ -435,6 +447,9 @@
             iree_hal_cuda_graph_command_buffer_allowed_categories,
         .begin = iree_hal_cuda_graph_command_buffer_begin,
         .end = iree_hal_cuda_graph_command_buffer_end,
+        .begin_debug_group =
+            iree_hal_cuda_graph_command_buffer_begin_debug_group,
+        .end_debug_group = iree_hal_cuda_graph_command_buffer_end_debug_group,
         .execution_barrier =
             iree_hal_cuda_graph_command_buffer_execution_barrier,
         .signal_event = iree_hal_cuda_graph_command_buffer_signal_event,
diff --git a/iree/hal/local/inline_command_buffer.c b/iree/hal/local/inline_command_buffer.c
index 5cf10d1..3fd5692 100644
--- a/iree/hal/local/inline_command_buffer.c
+++ b/iree/hal/local/inline_command_buffer.c
@@ -172,6 +172,22 @@
 }
 
 //===----------------------------------------------------------------------===//
+// iree_hal_inline_command_buffer_t debug utilities
+//===----------------------------------------------------------------------===//
+
+static void iree_hal_inline_command_buffer_begin_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer, iree_string_view_t label,
+    iree_hal_label_color_t label_color,
+    const iree_hal_label_location_t* location) {
+  // TODO(benvanik): tracy event stack.
+}
+
+static void iree_hal_inline_command_buffer_end_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  // TODO(benvanik): tracy event stack.
+}
+
+//===----------------------------------------------------------------------===//
 // iree_hal_command_buffer_execution_barrier
 //===----------------------------------------------------------------------===//
 
@@ -484,6 +500,8 @@
         .allowed_categories = iree_hal_inline_command_buffer_allowed_categories,
         .begin = iree_hal_inline_command_buffer_begin,
         .end = iree_hal_inline_command_buffer_end,
+        .begin_debug_group = iree_hal_inline_command_buffer_begin_debug_group,
+        .end_debug_group = iree_hal_inline_command_buffer_end_debug_group,
         .execution_barrier = iree_hal_inline_command_buffer_execution_barrier,
         .signal_event = iree_hal_inline_command_buffer_signal_event,
         .reset_event = iree_hal_inline_command_buffer_reset_event,
diff --git a/iree/hal/local/task_command_buffer.c b/iree/hal/local/task_command_buffer.c
index b05c86c..2e78bd1 100644
--- a/iree/hal/local/task_command_buffer.c
+++ b/iree/hal/local/task_command_buffer.c
@@ -379,6 +379,22 @@
 }
 
 //===----------------------------------------------------------------------===//
+// iree_hal_task_command_buffer_t debug utilities
+//===----------------------------------------------------------------------===//
+
+static void iree_hal_task_command_buffer_begin_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer, iree_string_view_t label,
+    iree_hal_label_color_t label_color,
+    const iree_hal_label_location_t* location) {
+  // TODO(benvanik): tracy event stack.
+}
+
+static void iree_hal_task_command_buffer_end_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  // TODO(benvanik): tracy event stack.
+}
+
+//===----------------------------------------------------------------------===//
 // iree_hal_command_buffer_execution_barrier
 //===----------------------------------------------------------------------===//
 
@@ -896,6 +912,8 @@
         .allowed_categories = iree_hal_task_command_buffer_allowed_categories,
         .begin = iree_hal_task_command_buffer_begin,
         .end = iree_hal_task_command_buffer_end,
+        .begin_debug_group = iree_hal_task_command_buffer_begin_debug_group,
+        .end_debug_group = iree_hal_task_command_buffer_end_debug_group,
         .execution_barrier = iree_hal_task_command_buffer_execution_barrier,
         .signal_event = iree_hal_task_command_buffer_signal_event,
         .reset_event = iree_hal_task_command_buffer_reset_event,
diff --git a/iree/hal/vulkan/direct_command_buffer.cc b/iree/hal/vulkan/direct_command_buffer.cc
index d7de16e..96f8eed 100644
--- a/iree/hal/vulkan/direct_command_buffer.cc
+++ b/iree/hal/vulkan/direct_command_buffer.cc
@@ -213,6 +213,49 @@
   return iree_ok_status();
 }
 
+static void iree_hal_vulkan_direct_command_buffer_begin_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer, iree_string_view_t label,
+    iree_hal_label_color_t label_color,
+    const iree_hal_label_location_t* location) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+  IREE_VULKAN_TRACE_ZONE_BEGIN_EXTERNAL(
+      command_buffer->tracing_context, command_buffer->handle,
+      location ? location->file.data : NULL, location ? location->file.size : 0,
+      location ? location->line : 0, /*func_name=*/NULL, 0, label.data,
+      label.size);
+  if (command_buffer->syms->vkCmdBeginDebugUtilsLabelEXT) {
+    char label_buffer[128];
+    snprintf(label_buffer, sizeof(label_buffer), "%.*s", (int)label.size,
+             label.data);
+    VkDebugUtilsLabelEXT label_info = {
+        /*.sType=*/VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT,
+        /*.pNext=*/NULL,
+        /*.pLabelName=*/label_buffer,
+        /*.color=*/
+        {
+            /*r=*/label_color.r / 255.0f,
+            /*g=*/label_color.g / 255.0f,
+            /*b=*/label_color.b / 255.0f,
+            /*a=*/label_color.a / 255.0f,
+        },
+    };
+    command_buffer->syms->vkCmdBeginDebugUtilsLabelEXT(command_buffer->handle,
+                                                       &label_info);
+  }
+}
+
+static void iree_hal_vulkan_direct_command_buffer_end_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+  if (command_buffer->syms->vkCmdEndDebugUtilsLabelEXT) {
+    command_buffer->syms->vkCmdEndDebugUtilsLabelEXT(command_buffer->handle);
+  }
+  IREE_VULKAN_TRACE_ZONE_END(command_buffer->tracing_context,
+                             command_buffer->handle);
+}
+
 static VkPipelineStageFlags iree_hal_vulkan_convert_pipeline_stage_flags(
     iree_hal_execution_stage_t stage_mask) {
   VkPipelineStageFlags flags = 0;
@@ -681,6 +724,10 @@
         iree_hal_vulkan_direct_command_buffer_allowed_categories,
         /*.begin=*/iree_hal_vulkan_direct_command_buffer_begin,
         /*.end=*/iree_hal_vulkan_direct_command_buffer_end,
+        /*.begin_debug_group=*/
+        iree_hal_vulkan_direct_command_buffer_begin_debug_group,
+        /*.end_debug_group=*/
+        iree_hal_vulkan_direct_command_buffer_end_debug_group,
         /*.execution_barrier=*/
         iree_hal_vulkan_direct_command_buffer_execution_barrier,
         /*.signal_event=*/
diff --git a/iree/modules/hal/exports.inl b/iree/modules/hal/exports.inl
index f137bd7..8d8adc3 100644
--- a/iree/modules/hal/exports.inl
+++ b/iree/modules/hal/exports.inl
@@ -41,12 +41,14 @@
 EXPORT_FN("buffer_view.trace", iree_hal_module_buffer_view_trace, rCrD, v)
 
 EXPORT_FN("command_buffer.begin", iree_hal_module_command_buffer_begin, r, v)
+EXPORT_FN("command_buffer.begin_debug_group", iree_hal_module_command_buffer_begin_debug_group, rr, v)
 EXPORT_FN("command_buffer.bind_descriptor_set", iree_hal_module_command_buffer_bind_descriptor_set, rrirCiD, v)
 EXPORT_FN("command_buffer.copy_buffer", iree_hal_module_command_buffer_copy_buffer, rririi, v)
 EXPORT_FN("command_buffer.create", iree_hal_module_command_buffer_create, rii, r)
 EXPORT_FN("command_buffer.dispatch", iree_hal_module_command_buffer_dispatch, rriiii, v)
 EXPORT_FN("command_buffer.dispatch.indirect", iree_hal_module_command_buffer_dispatch_indirect, rriri, v)
 EXPORT_FN("command_buffer.end", iree_hal_module_command_buffer_end, r, v)
+EXPORT_FN("command_buffer.end_debug_group", iree_hal_module_command_buffer_end_debug_group, r, v)
 EXPORT_FN("command_buffer.execution_barrier", iree_hal_module_command_buffer_execution_barrier, riii, v)
 EXPORT_FN("command_buffer.fill_buffer", iree_hal_module_command_buffer_fill_buffer, rriii, v)
 EXPORT_FN("command_buffer.push_constants", iree_hal_module_command_buffer_push_constants, rriCiD, v)
diff --git a/iree/modules/hal/module.c b/iree/modules/hal/module.c
index caf9a02..61ea37b 100644
--- a/iree/modules/hal/module.c
+++ b/iree/modules/hal/module.c
@@ -556,6 +556,35 @@
   return iree_hal_command_buffer_end(command_buffer);
 }
 
+IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_begin_debug_group,  //
+                   iree_hal_module_state_t,                           //
+                   rr, v) {
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_command_buffer_check_deref(args->r0, &command_buffer));
+  iree_vm_buffer_t* label = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_buffer_check_deref(args->r1, &label));
+  iree_string_view_t label_str = iree_vm_buffer_as_string(label);
+  // TODO(benvanik): query from VM.
+  iree_hal_label_location_t location = {
+      .file = iree_string_view_empty(),
+      .line = 0,
+  };
+  iree_hal_command_buffer_begin_debug_group(
+      command_buffer, label_str, iree_hal_label_color_unspecified(), &location);
+  return iree_ok_status();
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_end_debug_group,  //
+                   iree_hal_module_state_t,                         //
+                   r, v) {
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_command_buffer_check_deref(args->r0, &command_buffer));
+  iree_hal_command_buffer_end_debug_group(command_buffer);
+  return iree_ok_status();
+}
+
 IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_execution_barrier,  //
                    iree_hal_module_state_t,                           //
                    riii, v) {