[hip][cuda] Added finer-grained tracing options to hip. (#18180)

This allows us to specify the verbosity at which we want
device-side tracing. Command-buffer level tracing has
a significantly lower overhead if we do not need
individual kernel timing.

---------

Signed-off-by: Andrew Woloszyn <andrew.woloszyn@gmail.com>
diff --git a/runtime/src/iree/hal/drivers/cuda/api.h b/runtime/src/iree/hal/drivers/cuda/api.h
index a53ada0..d402422 100644
--- a/runtime/src/iree/hal/drivers/cuda/api.h
+++ b/runtime/src/iree/hal/drivers/cuda/api.h
@@ -77,15 +77,14 @@
   // Specifies how command buffers are recorded and executed.
   iree_hal_cuda_command_buffer_mode_t command_buffer_mode;
 
-  // Enables tracing of command buffers when IREE tracing is enabled.
-  // May take advantage of additional extensions for more accurate timing or
-  // hardware-specific performance counters.
+  // Controls the verbosity of command buffer tracing when IREE
+  // tracing is enabled.
   //
   // NOTE: tracing has a non-trivial overhead and will skew the timing of
-  // submissions and introduce false barriers between dispatches. Use this to
-  // identify slow dispatches and refine from there; be wary of whole-program
-  // tracing with this enabled.
-  bool stream_tracing;
+  // submissions and may introduce false barriers between dispatches.
+  // Use this to identify slow dispatches and command buffers and refine
+  // from there; be wary of whole-program tracing with this enabled.
+  int32_t stream_tracing;
 
   // Whether to use async allocations even if reported as available by the
   // device. Defaults to true when the device supports it.
diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_device.c b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
index a53f381..30cccca 100644
--- a/runtime/src/iree/hal/drivers/cuda/cuda_device.c
+++ b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
@@ -277,7 +277,7 @@
   out_params->event_pool_capacity = 32;
   out_params->queue_count = 1;
   out_params->command_buffer_mode = IREE_HAL_CUDA_COMMAND_BUFFER_MODE_GRAPH;
-  out_params->stream_tracing = false;
+  out_params->stream_tracing = 0;
   out_params->async_allocations = true;
 }
 
@@ -346,9 +346,18 @@
 
   // Enable tracing for the (currently only) stream - no-op if disabled.
   if (iree_status_is_ok(status) && device->params.stream_tracing) {
+    if (device->params.stream_tracing >= IREE_HAL_CUDA_TRACING_VERBOSITY_MAX ||
+        device->params.stream_tracing < IREE_HAL_CUDA_TRACING_VERBOSITY_OFF) {
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "invalid stream_tracing argument: expected to be between %d and %d",
+          IREE_HAL_CUDA_TRACING_VERBOSITY_OFF,
+          IREE_HAL_CUDA_TRACING_VERBOSITY_MAX);
+    }
     status = iree_hal_cuda_tracing_context_allocate(
         device->cuda_symbols, device->identifier, dispatch_stream,
-        &device->block_pool, host_allocator, &device->tracing_context);
+        device->params.stream_tracing, &device->block_pool, host_allocator,
+        &device->tracing_context);
   }
 
   // Memory pool support is conditional.
diff --git a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
index 68d4d34..e5b88df 100644
--- a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
@@ -82,9 +82,10 @@
 #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
 
 static void iree_cuda_graph_command_buffer_trace_zone_begin_external(
-    iree_hal_cuda_graph_command_buffer_t* command_buffer, const char* file_name,
-    size_t file_name_length, uint32_t line, const char* function_name,
-    size_t function_name_length, const char* name, size_t name_length) {
+    iree_hal_cuda_graph_command_buffer_t* command_buffer, int32_t verbosity,
+    const char* file_name, size_t file_name_length, uint32_t line,
+    const char* function_name, size_t function_name_length, const char* name,
+    size_t name_length) {
   // Make sure there are no new nodes after the last barrier.
   // Work should start after the event.
   if (IREE_UNLIKELY(command_buffer->graph_node_count != 0)) {
@@ -97,7 +98,7 @@
   size_t dependency_count = command_buffer->cu_barrier_node ? 1 : 0;
   IREE_CUDA_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL(
       command_buffer->tracing_context, &command_buffer->tracing_event_list,
-      tracing_event_node, command_buffer->cu_graph,
+      tracing_event_node, command_buffer->cu_graph, verbosity,
       &command_buffer->cu_barrier_node, dependency_count, file_name,
       file_name_length, line, function_name, function_name_length, name,
       name_length);
@@ -109,7 +110,7 @@
 }
 
 static void iree_cuda_graph_command_buffer_trace_zone_end(
-    iree_hal_cuda_graph_command_buffer_t* command_buffer) {
+    iree_hal_cuda_graph_command_buffer_t* command_buffer, int32_t verbosity) {
   // Make sure there are no new nodes after the last barrier.
   // Prior work should end before the tracing event is recorded.
   if (IREE_UNLIKELY(command_buffer->graph_node_count != 0)) {
@@ -124,7 +125,7 @@
                  "ending a zone should at least depend on the beginning");
   IREE_CUDA_GRAPH_TRACE_ZONE_END(
       command_buffer->tracing_context, &command_buffer->tracing_event_list,
-      tracing_event_node, command_buffer->cu_graph,
+      tracing_event_node, command_buffer->cu_graph, verbosity,
       &command_buffer->cu_barrier_node, dependency_count);
 
   // We need to wait on the tracing end before other work starts.
@@ -132,27 +133,29 @@
   command_buffer->cu_barrier_node = *tracing_event_node;
 }
 
-#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(       \
-    command_buffer, file_name, file_name_length, line, function_name,   \
-    function_name_length, name, name_length)                            \
-  iree_cuda_graph_command_buffer_trace_zone_begin_external(             \
-      command_buffer, file_name, file_name_length, line, function_name, \
-      function_name_length, name, name_length)
-#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer) \
+#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(   \
+    command_buffer, verbosity, file_name, file_name_length, line,   \
+    function_name, function_name_length, name, name_length)         \
+  iree_cuda_graph_command_buffer_trace_zone_begin_external(         \
+      command_buffer, verbosity, file_name, file_name_length, line, \
+      function_name, function_name_length, name, name_length)
+#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer, \
+                                                        verbosity)      \
   IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(             \
-      command_buffer, /*file_name=*/NULL, 0, /*line=*/0, __FUNCTION__,  \
-      strlen(__FUNCTION__), /*name=*/NULL, 0)
-#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer) \
-  iree_cuda_graph_command_buffer_trace_zone_end(command_buffer)
+      command_buffer, verbosity, /*file_name=*/NULL, 0, /*line=*/0,     \
+      __FUNCTION__, strlen(__FUNCTION__), /*name=*/NULL, 0)
+#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, \
+                                                      verbosity)      \
+  iree_cuda_graph_command_buffer_trace_zone_end(command_buffer, verbosity)
 
 #else  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
 
-#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(     \
-    command_buffer, file_name, file_name_length, line, function_name, \
-    function_name_length, name, name_length)
-#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer)
-#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer)
-
+#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( \
+    command_buffer, verbosity, file_name, file_name_length, line, \
+    function_name, function_name_length, name, name_length)
+#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer, \
+                                                        verbosity)
+#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, verbosity)
 #endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
 
 iree_status_t iree_hal_cuda_graph_command_buffer_create(
@@ -335,7 +338,8 @@
       command_buffer->symbols,
       cuGraphCreate(&command_buffer->cu_graph, /*flags=*/0), "cuGraphCreate");
 
-  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer);
+  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(
+      command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE);
 
   return iree_ok_status();
 }
@@ -349,7 +353,8 @@
   IREE_RETURN_IF_ERROR(
       iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer));
 
-  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
+  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
+      command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE);
 
   // Reset state used during recording.
   command_buffer->cu_barrier_node = NULL;
@@ -384,8 +389,9 @@
 
   (void)command_buffer;
   IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(
-      command_buffer, location ? location->file.data : NULL,
-      location ? location->file.size : 0, location ? location->line : 0,
+      command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE,
+      location ? location->file.data : NULL, location ? location->file.size : 0,
+      location ? location->line : 0,
       /*func_name=*/NULL, 0, label.data, label.size);
 }
 
@@ -394,7 +400,8 @@
   iree_hal_cuda_graph_command_buffer_t* command_buffer =
       iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
   (void)command_buffer;
-  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
+  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
+      command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE);
 }
 
 static iree_status_t
@@ -507,7 +514,8 @@
   iree_hal_cuda_graph_command_buffer_t* command_buffer =
       iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
-  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer);
+  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(
+      command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer));
@@ -546,7 +554,8 @@
           dependency_count, &params, command_buffer->cu_context),
       "cuGraphAddMemsetNode");
 
-  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
+  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
+      command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }
@@ -557,7 +566,8 @@
   iree_hal_cuda_graph_command_buffer_t* command_buffer =
       iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
-  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer);
+  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(
+      command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer));
@@ -608,7 +618,8 @@
           dependency_count, &params, command_buffer->cu_context),
       "cuGraphAddMemcpyNode");
 
-  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
+  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
+      command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }
@@ -619,7 +630,8 @@
   iree_hal_cuda_graph_command_buffer_t* command_buffer =
       iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
-  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer);
+  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(
+      command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer));
@@ -666,7 +678,8 @@
           dependency_count, &params, command_buffer->cu_context),
       "cuGraphAddMemcpyNode");
 
-  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
+  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
+      command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }
@@ -763,9 +776,10 @@
               executable, entry_point, &kernel_info));
 
   IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(
-      command_buffer, kernel_info.source_filename.data,
-      kernel_info.source_filename.size, kernel_info.source_line,
-      kernel_info.function_name.data, kernel_info.function_name.size,
+      command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE,
+      kernel_info.source_filename.data, kernel_info.source_filename.size,
+      kernel_info.source_line, kernel_info.function_name.data,
+      kernel_info.function_name.size,
       /*name=*/NULL, 0);
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
@@ -865,7 +879,8 @@
           dependency_count, &params),
       "cuGraphAddKernelNode");
 
-  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
+  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
+      command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }
@@ -898,10 +913,10 @@
               executable, entry_point, &kernel_info));
 
   IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(
-      command_buffer, kernel_info.source_filename.data,
-      kernel_info.source_filename.size, kernel_info.source_line,
-      kernel_info.function_name.data, kernel_info.function_name.size,
-      /*name=*/NULL, 0);
+      command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE,
+      kernel_info.source_filename.data, kernel_info.source_filename.size,
+      kernel_info.source_line, kernel_info.function_name.data,
+      kernel_info.function_name.size, /*name=*/NULL, 0);
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1,
@@ -990,7 +1005,8 @@
           dependency_count, &params),
       "cuGraphAddKernelNode");
 
-  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
+  IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
+      command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }
diff --git a/runtime/src/iree/hal/drivers/cuda/nccl_channel.c b/runtime/src/iree/hal/drivers/cuda/nccl_channel.c
index e3eb31c..2f6eb3f 100644
--- a/runtime/src/iree/hal/drivers/cuda/nccl_channel.c
+++ b/runtime/src/iree/hal/drivers/cuda/nccl_channel.c
@@ -559,7 +559,8 @@
     iree_string_view_t collective_str =
         iree_hal_collective_op_format(&entry->op, &string_temp);
     IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(
-        tracing_context, tracing_event_list, stream, __FILE__, strlen(__FILE__),
+        tracing_context, tracing_event_list, stream,
+        IREE_HAL_CUDA_TRACING_VERBOSITY_FINE, __FILE__, strlen(__FILE__),
         (uint32_t)__LINE__, __FUNCTION__, strlen(__FUNCTION__),
         collective_str.data, collective_str.size);
   }
@@ -578,8 +579,8 @@
   // End all zones we began above - note that these are just simply nested so
   // order doesn't matter so long as we end the right number of zones.
   for (iree_host_size_t i = 0; i < batch->count; ++i) {
-    IREE_CUDA_STREAM_TRACE_ZONE_END(tracing_context, tracing_event_list,
-                                    stream);
+    IREE_CUDA_STREAM_TRACE_ZONE_END(tracing_context, tracing_event_list, stream,
+                                    IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
   }
 #endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
 
diff --git a/runtime/src/iree/hal/drivers/cuda/registration/driver_module.c b/runtime/src/iree/hal/drivers/cuda/registration/driver_module.c
index bea81bc..2e5bcff 100644
--- a/runtime/src/iree/hal/drivers/cuda/registration/driver_module.c
+++ b/runtime/src/iree/hal/drivers/cuda/registration/driver_module.c
@@ -26,10 +26,14 @@
     "Enables CUDA asynchronous stream-ordered allocations when supported.");
 
 IREE_FLAG(
-    bool, cuda_tracing, true,
-    "Enables tracing of stream events when Tracy instrumentation is enabled.\n"
-    "Severely impacts benchmark timings and should only be used when\n"
-    "analyzing dispatch timings.");
+    int32_t, cuda_tracing, 2,
+    "Controls the verbosity of tracing when Tracy instrumentation is enabled.\n"
+    "The impact to benchmark timing becomes more severe as the verbosity\n"
+    "increases, and thus should be only enabled when needed.\n"
+    "Permissible values are:\n"
+    "   0 : stream tracing disabled.\n"
+    "   1 : coarse command buffer level tracing enabled.\n"
+    "   2 : fine-grained kernel level tracing enabled.\n");
 
 IREE_FLAG(int32_t, cuda_default_index, 0,
           "Specifies the index of the default CUDA device to use");
diff --git a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c
index a9b50fc..4b8a0b1 100644
--- a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c
@@ -182,7 +182,7 @@
 
   IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(
       command_buffer->tracing_context, &command_buffer->tracing_event_list,
-      command_buffer->cu_stream,
+      command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE,
       /*file_name=*/NULL, 0, /*line=*/0, "iree_hal_cuda_stream_command_buffer",
       strlen("iree_hal_cuda_stream_command_buffer"), /*name=*/NULL, 0);
 
@@ -217,9 +217,9 @@
                                        command_buffer->resource_set,
                                        &command_buffer->collective_batch);
 
-  IREE_CUDA_STREAM_TRACE_ZONE_END(command_buffer->tracing_context,
-                                  &command_buffer->tracing_event_list,
-                                  command_buffer->cu_stream);
+  IREE_CUDA_STREAM_TRACE_ZONE_END(
+      command_buffer->tracing_context, &command_buffer->tracing_event_list,
+      command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE);
 
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
@@ -235,8 +235,9 @@
 
   IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(
       command_buffer->tracing_context, &command_buffer->tracing_event_list,
-      command_buffer->cu_stream, location ? location->file.data : NULL,
-      location ? location->file.size : 0, location ? location->line : 0,
+      command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE,
+      location ? location->file.data : NULL, location ? location->file.size : 0,
+      location ? location->line : 0,
       /*func_name=*/NULL, 0, label.data, label.size);
 
   // TODO: pass along to CUPTI if available.
@@ -250,9 +251,9 @@
 
   // TODO: pass along to CUPTI if available.
 
-  IREE_CUDA_STREAM_TRACE_ZONE_END(command_buffer->tracing_context,
-                                  &command_buffer->tracing_event_list,
-                                  command_buffer->cu_stream);
+  IREE_CUDA_STREAM_TRACE_ZONE_END(
+      command_buffer->tracing_context, &command_buffer->tracing_event_list,
+      command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE);
 }
 
 static iree_status_t iree_hal_cuda_stream_command_buffer_execution_barrier(
@@ -550,9 +551,10 @@
 
   IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(
       command_buffer->tracing_context, &command_buffer->tracing_event_list,
-      command_buffer->cu_stream, kernel_info.source_filename.data,
-      kernel_info.source_filename.size, kernel_info.source_line,
-      kernel_info.function_name.data, kernel_info.function_name.size,
+      command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE,
+      kernel_info.source_filename.data, kernel_info.source_filename.size,
+      kernel_info.source_line, kernel_info.function_name.data,
+      kernel_info.function_name.size,
       /*name=*/NULL, 0);
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
@@ -634,9 +636,9 @@
                      params_ptr, NULL),
       "cuLaunchKernel");
 
-  IREE_CUDA_STREAM_TRACE_ZONE_END(command_buffer->tracing_context,
-                                  &command_buffer->tracing_event_list,
-                                  command_buffer->cu_stream);
+  IREE_CUDA_STREAM_TRACE_ZONE_END(
+      command_buffer->tracing_context, &command_buffer->tracing_event_list,
+      command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
 
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
@@ -672,10 +674,10 @@
 
   IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(
       command_buffer->tracing_context, &command_buffer->tracing_event_list,
-      command_buffer->cu_stream, kernel_info.source_filename.data,
-      kernel_info.source_filename.size, kernel_info.source_line,
-      kernel_info.function_name.data, kernel_info.function_name.size,
-      /*name=*/NULL, 0);
+      command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE,
+      kernel_info.source_filename.data, kernel_info.source_filename.size,
+      kernel_info.source_line, kernel_info.function_name.data,
+      kernel_info.function_name.size, /*name=*/NULL, 0);
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1,
@@ -747,9 +749,9 @@
                      command_buffer->cu_stream, params_ptr, NULL),
       "cuLaunchKernel");
 
-  IREE_CUDA_STREAM_TRACE_ZONE_END(command_buffer->tracing_context,
-                                  &command_buffer->tracing_event_list,
-                                  command_buffer->cu_stream);
+  IREE_CUDA_STREAM_TRACE_ZONE_END(
+      command_buffer->tracing_context, &command_buffer->tracing_event_list,
+      command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
 
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
diff --git a/runtime/src/iree/hal/drivers/cuda/tracing.c b/runtime/src/iree/hal/drivers/cuda/tracing.c
index 913ca5d..057fdda 100644
--- a/runtime/src/iree/hal/drivers/cuda/tracing.c
+++ b/runtime/src/iree/hal/drivers/cuda/tracing.c
@@ -69,6 +69,8 @@
 
   uint32_t query_capacity;
 
+  iree_hal_cuda_tracing_verbosity_t verbosity;
+
   // Event pool reused to capture tracing timestamps.
   // The lifetime of the events are as follows.
   // 1) All events are allocated when the tracing context is created.
@@ -118,6 +120,7 @@
 iree_status_t iree_hal_cuda_tracing_context_allocate(
     const iree_hal_cuda_dynamic_symbols_t* symbols,
     iree_string_view_t queue_name, CUstream stream,
+    iree_hal_cuda_tracing_verbosity_t stream_tracing_verbosity,
     iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
     iree_hal_cuda_tracing_context_t** out_context) {
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -138,6 +141,7 @@
     context->query_capacity = IREE_ARRAYSIZE(context->event_pool);
     context->submitted_event_list.head = NULL;
     context->submitted_event_list.tail = NULL;
+    context->verbosity = stream_tracing_verbosity;
     iree_slim_mutex_initialize(&context->event_mutex);
   }
 
@@ -364,7 +368,8 @@
 // event.
 static uint16_t iree_hal_cuda_stream_tracing_context_insert_query(
     iree_hal_cuda_tracing_context_t* context,
-    iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream) {
+    iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream,
+    iree_hal_cuda_tracing_verbosity_t verbosity) {
   iree_slim_mutex_lock(&context->event_mutex);
   IREE_ASSERT_ARGUMENT(event_list);
 
@@ -392,7 +397,8 @@
 static uint16_t iree_hal_cuda_graph_tracing_context_insert_query(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list,
-    CUgraphNode* out_node, CUgraph graph, CUgraphNode* dependency_nodes,
+    CUgraphNode* out_node, CUgraph graph,
+    iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes,
     size_t dependency_nodes_count) {
   IREE_ASSERT_ARGUMENT(event_list);
   iree_slim_mutex_lock(&context->event_mutex);
@@ -426,22 +432,26 @@
 void iree_hal_cuda_stream_tracing_zone_begin_impl(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream,
+    iree_hal_cuda_tracing_verbosity_t verbosity,
     const iree_tracing_location_t* src_loc) {
   if (!context) return;
+  if (verbosity > context->verbosity) return;
+
   uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query(
-      context, event_list, stream);
+      context, event_list, stream, verbosity);
   iree_tracing_gpu_zone_begin(context->id, query_id, src_loc);
 }
 
 void iree_hal_cuda_stream_tracing_zone_begin_external_impl(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream,
-    const char* file_name, size_t file_name_length, uint32_t line,
-    const char* function_name, size_t function_name_length, const char* name,
-    size_t name_length) {
+    iree_hal_cuda_tracing_verbosity_t verbosity, const char* file_name,
+    size_t file_name_length, uint32_t line, const char* function_name,
+    size_t function_name_length, const char* name, size_t name_length) {
   if (!context) return;
+  if (verbosity > context->verbosity) return;
   uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query(
-      context, event_list, stream);
+      context, event_list, stream, verbosity);
   iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name,
                                        file_name_length, line, function_name,
                                        function_name_length, name, name_length);
@@ -450,13 +460,15 @@
 void iree_hal_cuda_graph_tracing_zone_begin_external_impl(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list,
-    CUgraphNode* out_node, CUgraph graph, CUgraphNode* dependency_nodes,
+    CUgraphNode* out_node, CUgraph graph,
+    iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes,
     size_t dependency_nodes_count, const char* file_name,
     size_t file_name_length, uint32_t line, const char* function_name,
     size_t function_name_length, const char* name, size_t name_length) {
   if (!context) return;
+  if (verbosity > context->verbosity) return;
   uint16_t query_id = iree_hal_cuda_graph_tracing_context_insert_query(
-      context, event_list, out_node, graph, dependency_nodes,
+      context, event_list, out_node, graph, verbosity, dependency_nodes,
       dependency_nodes_count);
   iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name,
                                        file_name_length, line, function_name,
@@ -465,21 +477,25 @@
 
 void iree_hal_cuda_stream_tracing_zone_end_impl(
     iree_hal_cuda_tracing_context_t* context,
-    iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream) {
+    iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream,
+    iree_hal_cuda_tracing_verbosity_t verbosity) {
   if (!context) return;
+  if (verbosity > context->verbosity) return;
   uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query(
-      context, event_list, stream);
+      context, event_list, stream, verbosity);
   iree_tracing_gpu_zone_end(context->id, query_id);
 }
 
 void iree_hal_cuda_graph_tracing_zone_end_impl(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list,
-    CUgraphNode* out_node, CUgraph graph, CUgraphNode* dependency_nodes,
+    CUgraphNode* out_node, CUgraph graph,
+    iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes,
     size_t dependency_nodes_count) {
   if (!context) return;
+  if (verbosity > context->verbosity) return;
   uint16_t query_id = iree_hal_cuda_graph_tracing_context_insert_query(
-      context, event_list, out_node, graph, dependency_nodes,
+      context, event_list, out_node, graph, verbosity, dependency_nodes,
       dependency_nodes_count);
   iree_tracing_gpu_zone_end(context->id, query_id);
 }
@@ -489,6 +505,7 @@
 iree_status_t iree_hal_cuda_tracing_context_allocate(
     const iree_hal_cuda_dynamic_symbols_t* symbols,
     iree_string_view_t queue_name, CUstream stream,
+    iree_hal_cuda_tracing_verbosity_t stream_tracing_verbosity,
     iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
     iree_hal_cuda_tracing_context_t** out_context) {
   *out_context = NULL;
diff --git a/runtime/src/iree/hal/drivers/cuda/tracing.h b/runtime/src/iree/hal/drivers/cuda/tracing.h
index abe468f..1174f77 100644
--- a/runtime/src/iree/hal/drivers/cuda/tracing.h
+++ b/runtime/src/iree/hal/drivers/cuda/tracing.h
@@ -52,11 +52,19 @@
   iree_hal_cuda_tracing_context_event_t* tail;
 } iree_hal_cuda_tracing_context_event_list_t;
 
+typedef enum iree_hal_cuda_tracing_verbosity_e {
+  IREE_HAL_CUDA_TRACING_VERBOSITY_OFF = 0,
+  IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE,
+  IREE_HAL_CUDA_TRACING_VERBOSITY_FINE,
+  IREE_HAL_CUDA_TRACING_VERBOSITY_MAX
+} iree_hal_cuda_tracing_verbosity_t;
+
 // Allocates a tracing context for the given CUDA |stream|.
 // Each context must only be used with the stream it was created for.
 iree_status_t iree_hal_cuda_tracing_context_allocate(
     const iree_hal_cuda_dynamic_symbols_t* symbols,
     iree_string_view_t queue_name, CUstream stream,
+    iree_hal_cuda_tracing_verbosity_t stream_tracing_verbosity,
     iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
     iree_hal_cuda_tracing_context_t** out_context);
 
@@ -88,6 +96,7 @@
 void iree_hal_cuda_stream_tracing_zone_begin_impl(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream,
+    iree_hal_cuda_tracing_verbosity_t verbosity,
     const iree_tracing_location_t* src_loc);
 
 // Begins an external zone using the given source information.
@@ -95,74 +104,80 @@
 void iree_hal_cuda_stream_tracing_zone_begin_external_impl(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream,
-    const char* file_name, size_t file_name_length, uint32_t line,
-    const char* function_name, size_t function_name_length, const char* name,
-    size_t name_length);
+    iree_hal_cuda_tracing_verbosity_t verbosity, const char* file_name,
+    size_t file_name_length, uint32_t line, const char* function_name,
+    size_t function_name_length, const char* name, size_t name_length);
 
 void iree_hal_cuda_graph_tracing_zone_begin_external_impl(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list,
-    CUgraphNode* out_node, CUgraph graph, CUgraphNode* dependency_nodes,
+    CUgraphNode* out_node, CUgraph graph,
+    iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes,
     size_t dependency_nodes_count, const char* file_name,
     size_t file_name_length, uint32_t line, const char* function_name,
     size_t function_name_length, const char* name, size_t name_length);
 
 void iree_hal_cuda_stream_tracing_zone_end_impl(
     iree_hal_cuda_tracing_context_t* context,
-    iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream);
+    iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream,
+    iree_hal_cuda_tracing_verbosity_t verbosity);
 void iree_hal_cuda_graph_tracing_zone_end_impl(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list,
-    CUgraphNode* out_node, CUgraph graph, CUgraphNode* dependency_nodes,
+    CUgraphNode* out_node, CUgraph graph,
+    iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes,
     size_t dependency_nodes_count);
 
 // Begins a new zone with the parent function name.
-#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN(context, event_list_begin,      \
-                                          event_list_end, stream)         \
-  static const iree_tracing_location_t TracyConcat(                       \
-      __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__, \
-                                            (uint32_t)__LINE__, 0};       \
-  iree_hal_cuda_stream_tracing_zone_begin_impl(                           \
-      context, event_list_begin, event_list_end, stream,                  \
+#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream,       \
+                                          verbosity)                         \
+  static const iree_tracing_location_t TracyConcat(                          \
+      __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__,    \
+                                            (uint32_t)__LINE__, 0};          \
+  iree_hal_cuda_stream_tracing_zone_begin_impl(                              \
+      context, event_list, stream, verbosity,                                \
       &TracyConcat(__tracy_source_location, __LINE__));
 
 // Begins an externally defined zone with a dynamic source location.
 // The |file_name|, |function_name|, and optional |name| strings will be copied
 // into the trace buffer and do not need to persist.
-#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(                   \
-    context, event_list, stream, file_name, file_name_length, line,   \
-    function_name, function_name_length, name, name_length)           \
-  iree_hal_cuda_stream_tracing_zone_begin_external_impl(              \
-      context, event_list, stream, file_name, file_name_length, line, \
-      function_name, function_name_length, name, name_length)
+#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(                            \
+    context, event_list, stream, verbosity, file_name, file_name_length, line, \
+    function_name, function_name_length, name, name_length)                    \
+  iree_hal_cuda_stream_tracing_zone_begin_external_impl(                       \
+      context, event_list, stream, verbosity, file_name, file_name_length,     \
+      line, function_name, function_name_length, name, name_length)
 #define IREE_CUDA_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL(                            \
-    context, event_list, out_node, graph, dependency_nodes,                   \
+    context, event_list, out_node, graph, verbosity, dependency_nodes,        \
     dependency_nodes_count, file_name, file_name_length, line, function_name, \
     function_name_length, name, name_length)                                  \
   iree_hal_cuda_graph_tracing_zone_begin_external_impl(                       \
-      context, event_list, out_node, graph, dependency_nodes,                 \
+      context, event_list, out_node, graph, verbosity, dependency_nodes,      \
       dependency_nodes_count, file_name, file_name_length, line,              \
       function_name, function_name_length, name, name_length)
 
-#define IREE_CUDA_STREAM_TRACE_ZONE_END(context, event_list, stream) \
-  iree_hal_cuda_stream_tracing_zone_end_impl(context, event_list, stream)
+#define IREE_CUDA_STREAM_TRACE_ZONE_END(context, event_list, stream,      \
+                                        verbosity)                        \
+  iree_hal_cuda_stream_tracing_zone_end_impl(context, event_list, stream, \
+                                             verbosity)
 #define IREE_CUDA_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph, \
-                                       dependency_nodes,                     \
+                                       verbosity, dependency_nodes,          \
                                        dependency_nodes_count)               \
-  iree_hal_cuda_graph_tracing_zone_end_impl(context, event_list, out_node,   \
-                                            graph, dependency_nodes,         \
-                                            dependency_nodes_count)
+  iree_hal_cuda_graph_tracing_zone_end_impl(                                 \
+      context, event_list, out_node, graph, verbosity, dependency_nodes,     \
+      dependency_nodes_count)
 #else
 
-#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream)
-#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(                 \
-    context, event_list, stream, file_name, file_name_length, line, \
+#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream, \
+                                          verbosity)
+#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(                            \
+    context, event_list, stream, verbosity, file_name, file_name_length, line, \
     function_name, function_name_length, name, name_length)
 #define IREE_CUDA_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL(                            \
-    context, event_list, out_node, graph, dependency_nodes,                   \
+    context, event_list, out_node, graph, verbosity, dependency_nodes,        \
     dependency_nodes_count, file_name, file_name_length, line, function_name, \
     function_name_length, name, name_length)
-#define IREE_CUDA_STREAM_TRACE_ZONE_END(context, event_list, stream)
+#define IREE_CUDA_STREAM_TRACE_ZONE_END(context, event_list, stream, verbosity)
 
 #endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
 
diff --git a/runtime/src/iree/hal/drivers/hip/api.h b/runtime/src/iree/hal/drivers/hip/api.h
index a505535..2218b19 100644
--- a/runtime/src/iree/hal/drivers/hip/api.h
+++ b/runtime/src/iree/hal/drivers/hip/api.h
@@ -76,15 +76,16 @@
   // Specifies how command buffers are recorded and executed.
   iree_hal_hip_command_buffer_mode_t command_buffer_mode;
 
-  // Enables tracing of command buffers when IREE tracing is enabled.
+  // Controls the verbosity of command buffer tracing when IREE tracing is
+  // enabled.
   // May take advantage of additional extensions for more accurate timing or
   // hardware-specific performance counters.
   //
   // NOTE: tracing has a non-trivial overhead and will skew the timing of
-  // submissions and introduce false barriers between dispatches. Use this to
-  // identify slow dispatches and refine from there; be wary of whole-program
-  // tracing with this enabled.
-  bool stream_tracing;
+  // submissions and may introduce false barriers between dispatches.
+  // Use this to identify slow dispatches and command buffers and refine
+  // from there; be wary of whole-program tracing with this enabled.
+  int32_t stream_tracing;
 
   // Whether to use async allocations even if reported as available by the
   // device. Defaults to true when the device supports it.
diff --git a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c
index 99b3538..afade26 100644
--- a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c
@@ -83,7 +83,8 @@
 #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
 
 static void iree_hip_graph_command_buffer_trace_zone_begin_external(
-    iree_hal_hip_graph_command_buffer_t* command_buffer, const char* file_name,
+    iree_hal_hip_graph_command_buffer_t* command_buffer,
+    iree_hal_hip_tracing_verbosity_t verbosity, const char* file_name,
     size_t file_name_length, uint32_t line, const char* function_name,
     size_t function_name_length, const char* name, size_t name_length) {
   // Make sure there are no new nodes after the last barrier.
@@ -98,7 +99,7 @@
   size_t dependency_count = command_buffer->hip_barrier_node ? 1 : 0;
   IREE_HIP_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL(
       command_buffer->tracing_context, &command_buffer->tracing_event_list,
-      tracing_event_node, command_buffer->hip_graph,
+      tracing_event_node, command_buffer->hip_graph, verbosity,
       &command_buffer->hip_barrier_node, dependency_count, file_name,
       file_name_length, line, function_name, function_name_length, name,
       name_length);
@@ -110,7 +111,8 @@
 }
 
 static void iree_hip_graph_command_buffer_trace_zone_end(
-    iree_hal_hip_graph_command_buffer_t* command_buffer) {
+    iree_hal_hip_graph_command_buffer_t* command_buffer,
+    iree_hal_hip_tracing_verbosity_t verbosity) {
   // Make sure there are no new nodes after the last barrier.
   // Prior work should end before the tracing event is recorded.
   if (IREE_UNLIKELY(command_buffer->graph_node_count != 0)) {
@@ -125,7 +127,7 @@
                  "ending a zone should at least depend on the beginning");
   IREE_HIP_GRAPH_TRACE_ZONE_END(
       command_buffer->tracing_context, &command_buffer->tracing_event_list,
-      tracing_event_node, command_buffer->hip_graph,
+      tracing_event_node, command_buffer->hip_graph, verbosity,
       &command_buffer->hip_barrier_node, dependency_count);
 
   // We need to wait on the tracing end before other work starts.
@@ -133,26 +135,29 @@
   command_buffer->hip_barrier_node = *tracing_event_node;
 }
 
-#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(        \
-    command_buffer, file_name, file_name_length, line, function_name,   \
-    function_name_length, name, name_length)                            \
-  iree_hip_graph_command_buffer_trace_zone_begin_external(              \
-      command_buffer, file_name, file_name_length, line, function_name, \
-      function_name_length, name, name_length)
-#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer) \
+#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(    \
+    command_buffer, verbosity, file_name, file_name_length, line,   \
+    function_name, function_name_length, name, name_length)         \
+  iree_hip_graph_command_buffer_trace_zone_begin_external(          \
+      command_buffer, verbosity, file_name, file_name_length, line, \
+      function_name, function_name_length, name, name_length)
+#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer, \
+                                                       verbosity)      \
   IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(             \
-      command_buffer, /*file_name=*/NULL, 0, /*line=*/0, __FUNCTION__, \
-      strlen(__FUNCTION__), /*name=*/NULL, 0)
-#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer) \
-  iree_hip_graph_command_buffer_trace_zone_end(command_buffer)
+      command_buffer, verbosity, /*file_name=*/NULL, 0, /*line=*/0,    \
+      __FUNCTION__, strlen(__FUNCTION__), /*name=*/NULL, 0)
+#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, \
+                                                     verbosity)      \
+  iree_hip_graph_command_buffer_trace_zone_end(command_buffer, verbosity)
 
 #else  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
 
-#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(      \
-    command_buffer, file_name, file_name_length, line, function_name, \
-    function_name_length, name, name_length)
-#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer)
-#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer)
+#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(  \
+    command_buffer, verbosity, file_name, file_name_length, line, \
+    function_name, function_name_length, name, name_length)
+#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer, \
+                                                       verbosity)
+#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, verbosity)
 
 #endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
 
@@ -340,7 +345,7 @@
       "hipGraphCreate");
 
   IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(
-      command_buffer,
+      command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE,
       /*file_name=*/NULL, 0, /*line=*/0, "iree_hal_hip_graph_command_buffer",
       strlen("iree_hal_hip_graph_command_buffer"),
       /*name=*/NULL, 0);
@@ -357,7 +362,8 @@
   IREE_RETURN_IF_ERROR(
       iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer));
 
-  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
+  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
+      command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE);
 
   // Reset state used during recording.
   command_buffer->hip_barrier_node = NULL;
@@ -392,8 +398,9 @@
 
   (void)command_buffer;
   IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(
-      command_buffer, location ? location->file.data : NULL,
-      location ? location->file.size : 0, location ? location->line : 0,
+      command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE,
+      location ? location->file.data : NULL, location ? location->file.size : 0,
+      location ? location->line : 0,
       /*func_name=*/NULL, 0, label.data, label.size);
 }
 
@@ -402,7 +409,8 @@
   iree_hal_hip_graph_command_buffer_t* command_buffer =
       iree_hal_hip_graph_command_buffer_cast(base_command_buffer);
   (void)command_buffer;
-  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
+  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
+      command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE);
 }
 
 static iree_status_t
@@ -515,7 +523,8 @@
   iree_hal_hip_graph_command_buffer_t* command_buffer =
       iree_hal_hip_graph_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
-  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer);
+  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(
+      command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE);
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer));
@@ -553,7 +562,8 @@
           dependency_count, &params),
       "hipGraphAddMemsetNode");
 
-  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
+  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
+      command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE);
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }
@@ -569,7 +579,8 @@
                             "cannot use graph-based command buffer");
   }
   IREE_TRACE_ZONE_BEGIN(z0);
-  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer);
+  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(
+      command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE);
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer));
@@ -621,7 +632,8 @@
           dependency_count, &params, command_buffer->hip_context),
       "hipDrvGraphAddMemcpyNode");
 
-  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
+  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
+      command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE);
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }
@@ -637,7 +649,8 @@
                             "cannot use graph-based command buffer");
   }
   IREE_TRACE_ZONE_BEGIN(z0);
-  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer);
+  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(
+      command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE);
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer));
@@ -683,7 +696,8 @@
           dependency_count, &params, command_buffer->hip_context),
       "hipDrvGraphAddMemcpyNode");
 
-  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
+  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
+      command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE);
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }
@@ -787,9 +801,10 @@
               executable, entry_point, &kernel_info));
 
   IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(
-      command_buffer, kernel_info.source_filename.data,
-      kernel_info.source_filename.size, kernel_info.source_line,
-      kernel_info.function_name.data, kernel_info.function_name.size,
+      command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE,
+      kernel_info.source_filename.data, kernel_info.source_filename.size,
+      kernel_info.source_line, kernel_info.function_name.data,
+      kernel_info.function_name.size,
       /*name=*/NULL, 0);
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
@@ -874,7 +889,8 @@
           dependency_count, &params),
       "hipGraphAddKernelNode");
 
-  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
+  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
+      command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE);
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }
@@ -907,10 +923,10 @@
               executable, entry_point, &kernel_info));
 
   IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(
-      command_buffer, kernel_info.source_filename.data,
-      kernel_info.source_filename.size, kernel_info.source_line,
-      kernel_info.function_name.data, kernel_info.function_name.size,
-      /*name=*/NULL, 0);
+      command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE,
+      kernel_info.source_filename.data, kernel_info.source_filename.size,
+      kernel_info.source_line, kernel_info.function_name.data,
+      kernel_info.function_name.size, /*name=*/NULL, 0);
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1,
@@ -990,7 +1006,8 @@
           dependency_count, &params),
       "hipGraphAddKernelNode");
 
-  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
+  IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
+      command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE);
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }
diff --git a/runtime/src/iree/hal/drivers/hip/hip_device.c b/runtime/src/iree/hal/drivers/hip/hip_device.c
index 133d3f5..f92c784 100644
--- a/runtime/src/iree/hal/drivers/hip/hip_device.c
+++ b/runtime/src/iree/hal/drivers/hip/hip_device.c
@@ -275,7 +275,7 @@
   out_params->event_pool_capacity = 32;
   out_params->queue_count = 1;
   out_params->command_buffer_mode = IREE_HAL_HIP_COMMAND_BUFFER_MODE_STREAM;
-  out_params->stream_tracing = false;
+  out_params->stream_tracing = 0;
   out_params->async_allocations = true;
   out_params->allow_inline_execution = false;
 }
@@ -344,9 +344,22 @@
 
   // Enable tracing for the (currently only) stream - no-op if disabled.
   if (iree_status_is_ok(status) && device->params.stream_tracing) {
+    // Record an out-of-range verbosity in |status| rather than returning early
+    // so it flows through the same status checks as other setup failures here.
+    if (device->params.stream_tracing >= IREE_HAL_HIP_TRACING_VERBOSITY_MAX ||
+        device->params.stream_tracing < IREE_HAL_HIP_TRACING_VERBOSITY_OFF) {
+      status = iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "invalid stream_tracing argument: expected to be between %d and %d",
+          IREE_HAL_HIP_TRACING_VERBOSITY_OFF,
+          IREE_HAL_HIP_TRACING_VERBOSITY_MAX - 1);
+    }
+  }
+  if (iree_status_is_ok(status) && device->params.stream_tracing) {
     status = iree_hal_hip_tracing_context_allocate(
         device->hip_symbols, device->identifier, dispatch_stream,
-        &device->block_pool, host_allocator, &device->tracing_context);
+        device->params.stream_tracing, &device->block_pool, host_allocator,
+        &device->tracing_context);
   }
 
   // Memory pool support is conditional.
diff --git a/runtime/src/iree/hal/drivers/hip/rccl_channel.c b/runtime/src/iree/hal/drivers/hip/rccl_channel.c
index e3c38a2..84e592c 100644
--- a/runtime/src/iree/hal/drivers/hip/rccl_channel.c
+++ b/runtime/src/iree/hal/drivers/hip/rccl_channel.c
@@ -593,7 +593,8 @@
     iree_string_view_t collective_str =
         iree_hal_collective_op_format(&entry->op, &string_temp);
     IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(
-        tracing_context, tracing_event_list, stream, __FILE__, strlen(__FILE__),
+        tracing_context, tracing_event_list, stream,
+        IREE_HAL_HIP_TRACING_VERBOSITY_FINE, __FILE__, strlen(__FILE__),
         (uint32_t)__LINE__, __FUNCTION__, strlen(__FUNCTION__),
         collective_str.data, collective_str.size);
   }
@@ -613,7 +614,8 @@
   IREE_TRACE({
     for (iree_host_size_t i = 0; i < batch->count; ++i) {
       IREE_HIP_STREAM_TRACE_ZONE_END(tracing_context, tracing_event_list,
-                                     stream);
+                                     stream,
+                                     IREE_HAL_HIP_TRACING_VERBOSITY_FINE);
     }
   });
 
diff --git a/runtime/src/iree/hal/drivers/hip/registration/driver_module.c b/runtime/src/iree/hal/drivers/hip/registration/driver_module.c
index cabb1da..1998cfc 100644
--- a/runtime/src/iree/hal/drivers/hip/registration/driver_module.c
+++ b/runtime/src/iree/hal/drivers/hip/registration/driver_module.c
@@ -36,10 +36,14 @@
     "Enables HIP asynchronous stream-ordered allocations when supported.");
 
 IREE_FLAG(
-    bool, hip_tracing, true,
-    "Enables tracing of stream events when Tracy instrumentation is enabled.\n"
-    "Severely impacts benchmark timings and should only be used when\n"
-    "analyzing dispatch timings.");
+    int32_t, hip_tracing, 2,
+    "Controls the verbosity of tracing when Tracy instrumentation is enabled.\n"
+    "The impact to benchmark timing becomes more severe as the verbosity\n"
+    "increases, and thus should be only enabled when needed.\n"
+    "Permissible values are:\n"
+    "   0 : stream tracing disabled.\n"
+    "   1 : coarse command buffer level tracing enabled.\n"
+    "   2 : fine-grained kernel level tracing enabled.\n");
 
 IREE_FLAG(int32_t, hip_default_index, 0,
           "Specifies the index of the default HIP device to use");
@@ -181,7 +185,7 @@
             "Option 'hip_tracing' expected to be int. Got: '%.*s'",
             (int)value.size, value.data);
       }
-      device_params->stream_tracing = ivalue ? true : false;
+      device_params->stream_tracing = ivalue;
     } else if (iree_string_view_equal(key, key_hip_default_index)) {
       if (!iree_string_view_atoi_int32(value, &ivalue)) {
         return iree_make_status(
diff --git a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c
index e4ffac2..1b8b6b6 100644
--- a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c
@@ -183,7 +183,7 @@
 
   IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(
       command_buffer->tracing_context, &command_buffer->tracing_event_list,
-      command_buffer->hip_stream,
+      command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE,
       /*file_name=*/NULL, 0, /*line=*/0, "iree_hal_hip_stream_command_buffer",
       strlen("iree_hal_hip_stream_command_buffer"),
       /*name=*/NULL, 0);
@@ -212,9 +212,9 @@
       z0, iree_hal_resource_set_allocate(command_buffer->arena.block_pool,
                                          &command_buffer->resource_set));
 
-  IREE_HIP_STREAM_TRACE_ZONE_END(command_buffer->tracing_context,
-                                 &command_buffer->tracing_event_list,
-                                 command_buffer->hip_stream);
+  IREE_HIP_STREAM_TRACE_ZONE_END(
+      command_buffer->tracing_context, &command_buffer->tracing_event_list,
+      command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE);
 
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
@@ -230,8 +230,9 @@
 
   IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(
       command_buffer->tracing_context, &command_buffer->tracing_event_list,
-      command_buffer->hip_stream, location ? location->file.data : NULL,
-      location ? location->file.size : 0, location ? location->line : 0,
+      command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE,
+      location ? location->file.data : NULL, location ? location->file.size : 0,
+      location ? location->line : 0,
       /*func_name=*/NULL, 0, label.data, label.size);
 }
 
@@ -241,9 +242,9 @@
       iree_hal_hip_stream_command_buffer_cast(base_command_buffer);
   (void)command_buffer;
 
-  IREE_HIP_STREAM_TRACE_ZONE_END(command_buffer->tracing_context,
-                                 &command_buffer->tracing_event_list,
-                                 command_buffer->hip_stream);
+  IREE_HIP_STREAM_TRACE_ZONE_END(
+      command_buffer->tracing_context, &command_buffer->tracing_event_list,
+      command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE);
 }
 
 static iree_status_t iree_hal_hip_stream_command_buffer_execution_barrier(
@@ -542,9 +543,10 @@
 
   IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(
       command_buffer->tracing_context, &command_buffer->tracing_event_list,
-      command_buffer->hip_stream, kernel_info.source_filename.data,
-      kernel_info.source_filename.size, kernel_info.source_line,
-      kernel_info.function_name.data, kernel_info.function_name.size,
+      command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_FINE,
+      kernel_info.source_filename.data, kernel_info.source_filename.size,
+      kernel_info.source_line, kernel_info.function_name.data,
+      kernel_info.function_name.size,
       /*name=*/NULL, 0);
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
@@ -615,9 +617,9 @@
           command_buffer->hip_stream, params_ptr, NULL),
       "hipModuleLaunchKernel");
 
-  IREE_HIP_STREAM_TRACE_ZONE_END(command_buffer->tracing_context,
-                                 &command_buffer->tracing_event_list,
-                                 command_buffer->hip_stream);
+  IREE_HIP_STREAM_TRACE_ZONE_END(
+      command_buffer->tracing_context, &command_buffer->tracing_event_list,
+      command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_FINE);
 
   IREE_TRACE_ZONE_END(z0);
   return status;
@@ -652,10 +654,10 @@
 
   IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(
       command_buffer->tracing_context, &command_buffer->tracing_event_list,
-      command_buffer->hip_stream, kernel_info.source_filename.data,
-      kernel_info.source_filename.size, kernel_info.source_line,
-      kernel_info.function_name.data, kernel_info.function_name.size,
-      /*name=*/NULL, 0);
+      command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_FINE,
+      kernel_info.source_filename.data, kernel_info.source_filename.size,
+      kernel_info.source_line, kernel_info.function_name.data,
+      kernel_info.function_name.size, /*name=*/NULL, 0);
 
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1,
@@ -718,9 +720,9 @@
           params_ptr, NULL),
       "hipModuleLaunchKernel");
 
-  IREE_HIP_STREAM_TRACE_ZONE_END(command_buffer->tracing_context,
-                                 &command_buffer->tracing_event_list,
-                                 command_buffer->hip_stream);
+  IREE_HIP_STREAM_TRACE_ZONE_END(
+      command_buffer->tracing_context, &command_buffer->tracing_event_list,
+      command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_FINE);
 
   IREE_TRACE_ZONE_END(z0);
   return status;
diff --git a/runtime/src/iree/hal/drivers/hip/tracing.c b/runtime/src/iree/hal/drivers/hip/tracing.c
index f1a7007..62b15ef 100644
--- a/runtime/src/iree/hal/drivers/hip/tracing.c
+++ b/runtime/src/iree/hal/drivers/hip/tracing.c
@@ -67,6 +67,8 @@
   // Submitted events
   iree_hal_hip_tracing_context_event_list_t submitted_event_list;
 
+  int32_t verbosity;
+
   uint32_t query_capacity;
 
   // Event pool reused to capture tracing timestamps.
@@ -119,6 +121,7 @@
 iree_status_t iree_hal_hip_tracing_context_allocate(
     const iree_hal_hip_dynamic_symbols_t* symbols,
     iree_string_view_t queue_name, hipStream_t stream,
+    iree_hal_hip_tracing_verbosity_t stream_tracing_verbosity,
     iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
     iree_hal_hip_tracing_context_t** out_context) {
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -139,6 +142,7 @@
     context->query_capacity = IREE_ARRAYSIZE(context->event_pool);
     context->submitted_event_list.head = NULL;
     context->submitted_event_list.tail = NULL;
+    context->verbosity = stream_tracing_verbosity;
     iree_slim_mutex_initialize(&context->event_mutex);
   }
 
@@ -425,8 +429,10 @@
 void iree_hal_hip_stream_tracing_zone_begin_impl(
     iree_hal_hip_tracing_context_t* context,
     iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream,
+    iree_hal_hip_tracing_verbosity_t verbosity,
     const iree_tracing_location_t* src_loc) {
   if (!context) return;
+  if (verbosity > context->verbosity) return;
   uint16_t query_id = iree_hal_hip_stream_tracing_context_insert_query(
       context, event_list, stream);
   iree_tracing_gpu_zone_begin(context->id, query_id, src_loc);
@@ -435,10 +441,11 @@
 void iree_hal_hip_stream_tracing_zone_begin_external_impl(
     iree_hal_hip_tracing_context_t* context,
     iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream,
-    const char* file_name, size_t file_name_length, uint32_t line,
-    const char* function_name, size_t function_name_length, const char* name,
-    size_t name_length) {
+    iree_hal_hip_tracing_verbosity_t verbosity, const char* file_name,
+    size_t file_name_length, uint32_t line, const char* function_name,
+    size_t function_name_length, const char* name, size_t name_length) {
   if (!context) return;
+  if (verbosity > context->verbosity) return;
   uint16_t query_id = iree_hal_hip_stream_tracing_context_insert_query(
       context, event_list, stream);
   iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name,
@@ -450,11 +457,13 @@
     iree_hal_hip_tracing_context_t* context,
     iree_hal_hip_tracing_context_event_list_t* event_list,
     hipGraphNode_t* out_node, hipGraph_t graph,
+    iree_hal_hip_tracing_verbosity_t verbosity,
     hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count,
     const char* file_name, size_t file_name_length, uint32_t line,
     const char* function_name, size_t function_name_length, const char* name,
     size_t name_length) {
   if (!context) return;
+  if (verbosity > context->verbosity) return;
   uint16_t query_id = iree_hal_hip_graph_tracing_context_insert_query(
       context, event_list, out_node, graph, dependency_nodes,
       dependency_nodes_count);
@@ -465,8 +474,10 @@
 
 void iree_hal_hip_stream_tracing_zone_end_impl(
     iree_hal_hip_tracing_context_t* context,
-    iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream) {
+    iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream,
+    iree_hal_hip_tracing_verbosity_t verbosity) {
   if (!context) return;
+  if (verbosity > context->verbosity) return;
   uint16_t query_id = iree_hal_hip_stream_tracing_context_insert_query(
       context, event_list, stream);
   iree_tracing_gpu_zone_end(context->id, query_id);
@@ -476,8 +487,10 @@
     iree_hal_hip_tracing_context_t* context,
     iree_hal_hip_tracing_context_event_list_t* event_list,
     hipGraphNode_t* out_node, hipGraph_t graph,
+    iree_hal_hip_tracing_verbosity_t verbosity,
     hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count) {
   if (!context) return;
+  if (verbosity > context->verbosity) return;
   uint16_t query_id = iree_hal_hip_graph_tracing_context_insert_query(
       context, event_list, out_node, graph, dependency_nodes,
       dependency_nodes_count);
@@ -489,6 +502,7 @@
 iree_status_t iree_hal_hip_tracing_context_allocate(
     const iree_hal_hip_dynamic_symbols_t* symbols,
     iree_string_view_t queue_name, hipStream_t stream,
+    iree_hal_hip_tracing_verbosity_t stream_tracing_verbosity,
     iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
     iree_hal_hip_tracing_context_t** out_context) {
   *out_context = NULL;
diff --git a/runtime/src/iree/hal/drivers/hip/tracing.h b/runtime/src/iree/hal/drivers/hip/tracing.h
index 24e12b8..8323fd7 100644
--- a/runtime/src/iree/hal/drivers/hip/tracing.h
+++ b/runtime/src/iree/hal/drivers/hip/tracing.h
@@ -52,11 +52,19 @@
   iree_hal_hip_tracing_context_event_t* tail;
 } iree_hal_hip_tracing_context_event_list_t;
 
+typedef enum iree_hal_hip_tracing_verbosity_e {  // Ordered zone detail levels.
+  IREE_HAL_HIP_TRACING_VERBOSITY_OFF = 0,  // Lowest level.
+  IREE_HAL_HIP_TRACING_VERBOSITY_COARSE,   // Presumably command buffers.
+  IREE_HAL_HIP_TRACING_VERBOSITY_FINE,     // Presumably per-kernel zones.
+  IREE_HAL_HIP_TRACING_VERBOSITY_MAX       // Largest value; role unverified.
+} iree_hal_hip_tracing_verbosity_t;
+
 // Allocates a tracing context for the given HIP |stream|.
 // Each context must only be used with the stream it was created for.
 iree_status_t iree_hal_hip_tracing_context_allocate(
     const iree_hal_hip_dynamic_symbols_t* symbols,
     iree_string_view_t queue_name, hipStream_t stream,
+    iree_hal_hip_tracing_verbosity_t stream_tracing_verbosity,
     iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
     iree_hal_hip_tracing_context_t** out_context);
 
@@ -87,6 +95,7 @@
 void iree_hal_hip_stream_tracing_zone_begin_impl(
     iree_hal_hip_tracing_context_t* context,
     iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream,
+    iree_hal_hip_tracing_verbosity_t verbosity,
     const iree_tracing_location_t* src_loc);
 
 // Begins an external zone using the given source information.
@@ -94,14 +103,15 @@
 void iree_hal_hip_stream_tracing_zone_begin_external_impl(
     iree_hal_hip_tracing_context_t* context,
     iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream,
-    const char* file_name, size_t file_name_length, uint32_t line,
-    const char* function_name, size_t function_name_length, const char* name,
-    size_t name_length);
+    iree_hal_hip_tracing_verbosity_t verbosity, const char* file_name,
+    size_t file_name_length, uint32_t line, const char* function_name,
+    size_t function_name_length, const char* name, size_t name_length);
 
 void iree_hal_hip_graph_tracing_zone_begin_external_impl(
     iree_hal_hip_tracing_context_t* context,
     iree_hal_hip_tracing_context_event_list_t* event_list,
     hipGraphNode_t* out_node, hipGraph_t graph,
+    iree_hal_hip_tracing_verbosity_t verbosity,
     hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count,
     const char* file_name, size_t file_name_length, uint32_t line,
     const char* function_name, size_t function_name_length, const char* name,
@@ -109,61 +119,67 @@
 
 void iree_hal_hip_stream_tracing_zone_end_impl(
     iree_hal_hip_tracing_context_t* context,
-    iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream);
+    iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream,
+    iree_hal_hip_tracing_verbosity_t verbosity);
 void iree_hal_hip_graph_tracing_zone_end_impl(
     iree_hal_hip_tracing_context_t* context,
     iree_hal_hip_tracing_context_event_list_t* event_list,
     hipGraphNode_t* out_node, hipGraph_t graph,
+    iree_hal_hip_tracing_verbosity_t verbosity,
     hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count);
 
 // Begins a new zone with the parent function name.
-#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream)     \
+#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream,     \
+                                         verbosity)                       \
   static const iree_tracing_location_t TracyConcat(                       \
       __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__, \
                                             (uint32_t)__LINE__, 0};       \
   iree_hal_hip_stream_tracing_zone_begin_impl(                            \
-      context, event_list, stream,                                        \
+      context, event_list, stream, verbosity,                             \
       &TracyConcat(__tracy_source_location, __LINE__));
 
 // Begins an externally defined zone with a dynamic source location.
 // The |file_name|, |function_name|, and optional |name| strings will be copied
 // into the trace buffer and do not need to persist.
-#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(                    \
-    context, event_list, stream, file_name, file_name_length, line,   \
-    function_name, function_name_length, name, name_length)           \
-  iree_hal_hip_stream_tracing_zone_begin_external_impl(               \
-      context, event_list, stream, file_name, file_name_length, line, \
-      function_name, function_name_length, name, name_length)
+#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(                             \
+    context, event_list, stream, verbosity, file_name, file_name_length, line, \
+    function_name, function_name_length, name, name_length)                    \
+  iree_hal_hip_stream_tracing_zone_begin_external_impl(                        \
+      context, event_list, stream, verbosity, file_name, file_name_length,     \
+      line, function_name, function_name_length, name, name_length)
+
 #define IREE_HIP_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL(                             \
-    context, event_list, out_node, graph, dependency_nodes,                   \
+    context, event_list, out_node, graph, verbosity, dependency_nodes,        \
     dependency_nodes_count, file_name, file_name_length, line, function_name, \
     function_name_length, name, name_length)                                  \
   iree_hal_hip_graph_tracing_zone_begin_external_impl(                        \
-      context, event_list, out_node, graph, dependency_nodes,                 \
+      context, event_list, out_node, graph, verbosity, dependency_nodes,      \
       dependency_nodes_count, file_name, file_name_length, line,              \
       function_name, function_name_length, name, name_length)
 
-#define IREE_HIP_STREAM_TRACE_ZONE_END(context, event_list, stream) \
-  iree_hal_hip_stream_tracing_zone_end_impl(context, event_list, stream)
-#define IREE_HIP_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph, \
-                                      dependency_nodes,                     \
-                                      dependency_nodes_count)               \
-  iree_hal_hip_graph_tracing_zone_end_impl(context, event_list, out_node,   \
-                                           graph, dependency_nodes,         \
+#define IREE_HIP_STREAM_TRACE_ZONE_END(context, event_list, stream, verbosity) \
+  iree_hal_hip_stream_tracing_zone_end_impl(context, event_list, stream,       \
+                                            verbosity)
+
+#define IREE_HIP_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph,    \
+                                      verbosity, dependency_nodes,             \
+                                      dependency_nodes_count)                  \
+  iree_hal_hip_graph_tracing_zone_end_impl(context, event_list, out_node,      \
+                                           graph, verbosity, dependency_nodes, \
                                            dependency_nodes_count)
 #else
 
-#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream)
-#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(                  \
-    context, event_list, stream, file_name, file_name_length, line, \
+#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream, verbosity)
+#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(                             \
+    context, event_list, stream, verbosity, file_name, file_name_length, line, \
     function_name, function_name_length, name, name_length)
 #define IREE_HIP_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL(                             \
-    context, event_list, out_node, graph, dependency_nodes,                   \
+    context, event_list, out_node, graph, verbosity, dependency_nodes,        \
     dependency_nodes_count, file_name, file_name_length, line, function_name, \
     function_name_length, name, name_length)
-#define IREE_HIP_STREAM_TRACE_ZONE_END(context, evnet_list, stream)
+#define IREE_HIP_STREAM_TRACE_ZONE_END(context, event_list, stream, verbosity)
 #define IREE_HIP_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph, \
-                                      dependency_nodes,                     \
+                                      verbosity, dependency_nodes,          \
                                       dependency_nodes_count)
 #endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE