[HAL/AMDGPU] Sample counter ranges on a profile queue

Counter range profiling was hard-coded to host queue 0. That is also the deterministic target for IREE_HAL_QUEUE_AFFINITY_ANY, so a 1ms periodic flush could not interrupt a long-running submitted workload: the range stop/start packets sat behind the work they were supposed to sample.

Route counter range enable/start/flush through a small queue-selection helper and use the final host queue for range sampling, falling back to queue 0 when only one host queue exists. This keeps the default queue available for ordinary submissions while letting the profiling flusher run near its requested cadence on devices with multiple host queues.

A 100-iteration SDXL prompt-encoder capture at a 1ms flush interval moved from 204 device-time-range samples with 5.47ms average range duration to 1102 samples with 0.98ms average range duration. Add coverage that device-time-range counter samples are accepted by the test sink and are emitted on the selected profile queue.
diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc
index 48d316f..8db3484 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc
+++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc
@@ -920,7 +920,20 @@
       }
       EXPECT_NE(0u, record.sample_id);
       EXPECT_NE(0u, record.counter_set_id);
-      EXPECT_NE(0u, record.dispatch_event_id);
+      switch (record.scope) {
+        case IREE_HAL_PROFILE_COUNTER_SAMPLE_SCOPE_DISPATCH:
+          EXPECT_NE(0u, record.dispatch_event_id);
+          break;
+        case IREE_HAL_PROFILE_COUNTER_SAMPLE_SCOPE_DEVICE_TIME_RANGE:
+          EXPECT_EQ(0u, record.dispatch_event_id);
+          EXPECT_TRUE(iree_any_bit_set(
+              record.flags,
+              IREE_HAL_PROFILE_COUNTER_SAMPLE_FLAG_DEVICE_TICK_RANGE));
+          break;
+        default:
+          ADD_FAILURE() << "unexpected counter sample scope " << record.scope;
+          break;
+      }
       EXPECT_GT(record.sample_value_count, 0u);
       EXPECT_EQ(record.record_length,
                 sizeof(record) +
@@ -1125,6 +1138,26 @@
       profiling, sink, IREE_ARRAYSIZE(counter_names), counter_names);
 }
 
+static iree_status_t BeginSqWavesCounterRangeProfiling(
+    DeviceProfilingScope* profiling, CommandBufferProfileSink* sink) {
+  iree_string_view_t counter_names[] = {
+      IREE_SV("SQ_WAVES"),
+  };
+  iree_hal_profile_counter_set_selection_t counter_set = {
+      /*.flags=*/IREE_HAL_PROFILE_COUNTER_SET_SELECTION_FLAG_NONE,
+      /*.name=*/IREE_SV("smoke"),
+      /*.counter_name_count=*/IREE_ARRAYSIZE(counter_names),
+      /*.counter_names=*/counter_names,
+  };
+  iree_hal_device_profiling_options_t profiling_options = {0};
+  profiling_options.data_families =
+      IREE_HAL_DEVICE_PROFILING_DATA_COUNTER_RANGES;
+  profiling_options.sink = CommandBufferProfileSinkAsBase(sink);
+  profiling_options.counter_set_count = 1;
+  profiling_options.counter_sets = &counter_set;
+  return profiling->Begin(&profiling_options);
+}
+
 static iree_status_t BeginSqWaveWidthProfiling(DeviceProfilingScope* profiling,
                                                CommandBufferProfileSink* sink) {
   iree_string_view_t counter_names[] = {
@@ -2557,6 +2590,69 @@
 }
 
 TEST_F(HostQueueCommandBufferTest,
+       CounterRangeSamplesUseProfileQueueWhenAvailable) {
+  iree_hal_amdgpu_logical_device_options_t options;
+  iree_hal_amdgpu_logical_device_options_initialize(&options);
+  options.preallocate_pools = 0;
+
+  TestLogicalDevice test_device;
+  IREE_ASSERT_OK(
+      test_device.Initialize(&options, &libhsa_, &topology_, host_allocator_));
+  iree_hal_amdgpu_logical_device_t* logical_device =
+      test_device.logical_device();
+  ASSERT_GT(logical_device->physical_device_count, 0u);
+  for (iree_host_size_t i = 0; i < logical_device->physical_device_count; ++i) {
+    ASSERT_GT(logical_device->physical_devices[i]->host_queue_count, 0u);
+  }
+
+  CommandBufferProfileSink sink = {};
+  CommandBufferProfileSinkInitialize(&sink);
+  DeviceProfilingScope profiling(test_device.base_device());
+  iree_status_t profiling_status =
+      BeginSqWavesCounterRangeProfiling(&profiling, &sink);
+  if (IsHardwareCounterProfilingUnavailable(profiling_status)) {
+    iree_status_free(profiling_status);
+    GTEST_SKIP() << "AMDGPU hardware counter range profiling unavailable";
+  }
+  IREE_ASSERT_OK(profiling_status);
+
+  IREE_ASSERT_OK(iree_hal_device_profiling_flush(test_device.base_device()));
+  IREE_ASSERT_OK(profiling.End());
+
+  EXPECT_EQ(1, sink.begin_count);
+  EXPECT_EQ(1, sink.end_count);
+  EXPECT_EQ(1, sink.counter_set_metadata_count);
+  EXPECT_EQ(1, sink.counter_metadata_count);
+  EXPECT_GE(sink.counter_sample_count, 1);
+  ASSERT_FALSE(sink.counter_samples.empty());
+  iree_host_size_t sample_value_count = 0;
+  for (const iree_hal_profile_counter_sample_record_t& sample :
+       sink.counter_samples) {
+    EXPECT_TRUE(iree_all_bits_set(
+        sample.flags, IREE_HAL_PROFILE_COUNTER_SAMPLE_FLAG_DEVICE_TICK_RANGE));
+    EXPECT_EQ(IREE_HAL_PROFILE_COUNTER_SAMPLE_SCOPE_DEVICE_TIME_RANGE,
+              sample.scope);
+    EXPECT_EQ(0u, sample.dispatch_event_id);
+    EXPECT_EQ(0u, sample.submission_id);
+    EXPECT_EQ(0u, sample.command_buffer_id);
+    EXPECT_EQ(0u, sample.executable_id);
+    EXPECT_EQ(UINT32_MAX, sample.command_index);
+    EXPECT_EQ(UINT32_MAX, sample.export_ordinal);
+    ASSERT_LT(sample.physical_device_ordinal,
+              logical_device->physical_device_count);
+    const iree_hal_amdgpu_physical_device_t* physical_device =
+        logical_device->physical_devices[sample.physical_device_ordinal];
+    const uint32_t expected_queue_ordinal =
+        (uint32_t)(physical_device->host_queue_count - 1);
+    EXPECT_EQ(sample.physical_device_ordinal, physical_device->device_ordinal);
+    EXPECT_EQ(expected_queue_ordinal, sample.queue_ordinal);
+    EXPECT_LT(sample.start_tick, sample.end_tick);
+    sample_value_count += sample.sample_value_count;
+  }
+  ASSERT_EQ(sample_value_count, sink.counter_sample_values.size());
+}
+
+TEST_F(HostQueueCommandBufferTest,
        ProfilingFlushCounterSampleWriteFailurePreservesSamplesForRetry) {
   iree_hal_amdgpu_logical_device_options_t options;
   iree_hal_amdgpu_logical_device_options_initialize(&options);
diff --git a/runtime/src/iree/hal/drivers/amdgpu/logical_device.c b/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
index db5f3cc..92c31f6 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
@@ -866,6 +866,26 @@
   return status;
 }
 
+// Returns true when |queue_ordinal| is the physical device's counter range
+// sampling queue.
+//
+// Queue affinity ANY resolves to queue 0 for ordinary submissions, so using
+// the final queue gives the sampler the best chance to run independently while
+// the default queue is saturated. When only one queue exists we fall back to
+// that queue and sampling is necessarily ordered behind user work.
+static bool iree_hal_amdgpu_logical_device_is_profile_counter_range_queue(
+    const iree_hal_amdgpu_physical_device_t* physical_device,
+    iree_host_size_t queue_ordinal) {
+  return queue_ordinal + 1 == physical_device->host_queue_count;
+}
+
+static iree_hal_amdgpu_host_queue_t*
+iree_hal_amdgpu_logical_device_select_profile_counter_range_queue(
+    iree_hal_amdgpu_physical_device_t* physical_device) {
+  if (physical_device->host_queue_count == 0) return NULL;
+  return &physical_device->host_queues[physical_device->host_queue_count - 1];
+}
+
 static iree_status_t
 iree_hal_amdgpu_logical_device_set_counter_profiling_enabled(
     iree_hal_amdgpu_logical_device_t* logical_device,
@@ -898,7 +918,9 @@
         if (capture_dispatch_samples) {
           flags |= IREE_HAL_AMDGPU_PROFILE_COUNTER_ENABLE_FLAG_DISPATCH_SAMPLES;
         }
-        if (j == 0 && capture_queue_ranges) {
+        if (capture_queue_ranges &&
+            iree_hal_amdgpu_logical_device_is_profile_counter_range_queue(
+                physical_device, j)) {
           flags |= IREE_HAL_AMDGPU_PROFILE_COUNTER_ENABLE_FLAG_QUEUE_RANGES;
         }
         status = iree_hal_amdgpu_host_queue_enable_profile_counters(
@@ -947,8 +969,10 @@
                                 "logical device physical device has no host "
                                 "queues (initialization incomplete)");
     } else {
-      status = iree_hal_amdgpu_host_queue_start_profile_counter_ranges(
-          &physical_device->host_queues[0]);
+      iree_hal_amdgpu_host_queue_t* queue =
+          iree_hal_amdgpu_logical_device_select_profile_counter_range_queue(
+              physical_device);
+      status = iree_hal_amdgpu_host_queue_start_profile_counter_ranges(queue);
       if (iree_status_is_ok(status)) {
         ++started_device_count;
       }
@@ -959,10 +983,12 @@
     for (iree_host_size_t i = 0; i < started_device_count; ++i) {
       iree_hal_amdgpu_physical_device_t* physical_device =
           logical_device->physical_devices[i];
+      iree_hal_amdgpu_host_queue_t* queue =
+          iree_hal_amdgpu_logical_device_select_profile_counter_range_queue(
+              physical_device);
       status = iree_status_join(
           status, iree_hal_amdgpu_host_queue_flush_profile_counter_ranges(
-                      &physical_device->host_queues[0], /*sink=*/NULL,
-                      /*session_id=*/0,
+                      queue, /*sink=*/NULL, /*session_id=*/0,
                       IREE_HAL_AMDGPU_PROFILE_COUNTER_RANGE_FLUSH_FLAG_NONE));
     }
   }
@@ -994,8 +1020,11 @@
                                 "logical device physical device has no host "
                                 "queues (initialization incomplete)");
     } else {
+      iree_hal_amdgpu_host_queue_t* queue =
+          iree_hal_amdgpu_logical_device_select_profile_counter_range_queue(
+              physical_device);
       status = iree_hal_amdgpu_host_queue_flush_profile_counter_ranges(
-          &physical_device->host_queues[0], sink, session_id, flags);
+          queue, sink, session_id, flags);
     }
   }