[HAL/AMDGPU] Sample counter ranges on a profile queue

Counter range profiling was hard-coded to host queue 0. That is also the
deterministic target for IREE_HAL_QUEUE_AFFINITY_ANY, so a 1ms periodic
flush could not interrupt a long submitted workload: the range stop/start
packets sat behind the work they were supposed to sample.

Route counter range enable/start/flush through a small queue-selection
helper and use the final host queue for range sampling, with queue 0 as
the one-queue fallback. This keeps the default queue available for
ordinary submissions while letting the profiling flusher run near its
requested cadence on devices with multiple host queues.

A 100-iteration SDXL prompt-encoder capture at a 1ms flush interval moved
from 204 device-time-range samples with 5.47ms average range duration to
1102 samples with 0.98ms average range duration.

Add coverage that device-time-range counter samples are accepted by the
test sink and are emitted on the selected profile queue.
diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc
index 48d316f..8db3484 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc
+++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc
@@ -920,7 +920,20 @@ } EXPECT_NE(0u, record.sample_id); EXPECT_NE(0u, record.counter_set_id); - EXPECT_NE(0u, record.dispatch_event_id); + switch (record.scope) { + case IREE_HAL_PROFILE_COUNTER_SAMPLE_SCOPE_DISPATCH: + EXPECT_NE(0u, record.dispatch_event_id); + break; + case IREE_HAL_PROFILE_COUNTER_SAMPLE_SCOPE_DEVICE_TIME_RANGE: + EXPECT_EQ(0u, record.dispatch_event_id); + EXPECT_TRUE(iree_any_bit_set( + record.flags, + IREE_HAL_PROFILE_COUNTER_SAMPLE_FLAG_DEVICE_TICK_RANGE)); + break; + default: + ADD_FAILURE() << "unexpected counter sample scope " << record.scope; + break; + } EXPECT_GT(record.sample_value_count, 0u); EXPECT_EQ(record.record_length, sizeof(record) + @@ -1125,6 +1138,26 @@ profiling, sink, IREE_ARRAYSIZE(counter_names), counter_names); } +static iree_status_t BeginSqWavesCounterRangeProfiling( + DeviceProfilingScope* profiling, CommandBufferProfileSink* sink) { + iree_string_view_t counter_names[] = { + IREE_SV("SQ_WAVES"), + }; + iree_hal_profile_counter_set_selection_t counter_set = { + /*.flags=*/IREE_HAL_PROFILE_COUNTER_SET_SELECTION_FLAG_NONE, + /*.name=*/IREE_SV("smoke"), + /*.counter_name_count=*/IREE_ARRAYSIZE(counter_names), + /*.counter_names=*/counter_names, + }; + iree_hal_device_profiling_options_t profiling_options = {0}; + profiling_options.data_families = + IREE_HAL_DEVICE_PROFILING_DATA_COUNTER_RANGES; + profiling_options.sink = CommandBufferProfileSinkAsBase(sink); + profiling_options.counter_set_count = 1; + profiling_options.counter_sets = &counter_set; + return profiling->Begin(&profiling_options); +} + static iree_status_t BeginSqWaveWidthProfiling(DeviceProfilingScope* profiling, CommandBufferProfileSink* sink) { iree_string_view_t counter_names[] = { @@ -2557,6 +2590,69 @@ } TEST_F(HostQueueCommandBufferTest, + CounterRangeSamplesUseProfileQueueWhenAvailable) { + iree_hal_amdgpu_logical_device_options_t options; + iree_hal_amdgpu_logical_device_options_initialize(&options); + options.preallocate_pools = 0; + + 
TestLogicalDevice test_device; + IREE_ASSERT_OK( + test_device.Initialize(&options, &libhsa_, &topology_, host_allocator_)); + iree_hal_amdgpu_logical_device_t* logical_device = + test_device.logical_device(); + ASSERT_GT(logical_device->physical_device_count, 0u); + for (iree_host_size_t i = 0; i < logical_device->physical_device_count; ++i) { + ASSERT_GT(logical_device->physical_devices[i]->host_queue_count, 0u); + } + + CommandBufferProfileSink sink = {}; + CommandBufferProfileSinkInitialize(&sink); + DeviceProfilingScope profiling(test_device.base_device()); + iree_status_t profiling_status = + BeginSqWavesCounterRangeProfiling(&profiling, &sink); + if (IsHardwareCounterProfilingUnavailable(profiling_status)) { + iree_status_free(profiling_status); + GTEST_SKIP() << "AMDGPU hardware counter range profiling unavailable"; + } + IREE_ASSERT_OK(profiling_status); + + IREE_ASSERT_OK(iree_hal_device_profiling_flush(test_device.base_device())); + IREE_ASSERT_OK(profiling.End()); + + EXPECT_EQ(1, sink.begin_count); + EXPECT_EQ(1, sink.end_count); + EXPECT_EQ(1, sink.counter_set_metadata_count); + EXPECT_EQ(1, sink.counter_metadata_count); + EXPECT_GE(sink.counter_sample_count, 1); + ASSERT_FALSE(sink.counter_samples.empty()); + iree_host_size_t sample_value_count = 0; + for (const iree_hal_profile_counter_sample_record_t& sample : + sink.counter_samples) { + EXPECT_TRUE(iree_all_bits_set( + sample.flags, IREE_HAL_PROFILE_COUNTER_SAMPLE_FLAG_DEVICE_TICK_RANGE)); + EXPECT_EQ(IREE_HAL_PROFILE_COUNTER_SAMPLE_SCOPE_DEVICE_TIME_RANGE, + sample.scope); + EXPECT_EQ(0u, sample.dispatch_event_id); + EXPECT_EQ(0u, sample.submission_id); + EXPECT_EQ(0u, sample.command_buffer_id); + EXPECT_EQ(0u, sample.executable_id); + EXPECT_EQ(UINT32_MAX, sample.command_index); + EXPECT_EQ(UINT32_MAX, sample.export_ordinal); + ASSERT_LT(sample.physical_device_ordinal, + logical_device->physical_device_count); + const iree_hal_amdgpu_physical_device_t* physical_device = + 
logical_device->physical_devices[sample.physical_device_ordinal]; + const uint32_t expected_queue_ordinal = + (uint32_t)(physical_device->host_queue_count - 1); + EXPECT_EQ(sample.physical_device_ordinal, physical_device->device_ordinal); + EXPECT_EQ(expected_queue_ordinal, sample.queue_ordinal); + EXPECT_LT(sample.start_tick, sample.end_tick); + sample_value_count += sample.sample_value_count; + } + ASSERT_EQ(sample_value_count, sink.counter_sample_values.size()); +} + +TEST_F(HostQueueCommandBufferTest, ProfilingFlushCounterSampleWriteFailurePreservesSamplesForRetry) { iree_hal_amdgpu_logical_device_options_t options; iree_hal_amdgpu_logical_device_options_initialize(&options);
diff --git a/runtime/src/iree/hal/drivers/amdgpu/logical_device.c b/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
index db5f3cc..92c31f6 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
@@ -866,6 +866,26 @@ return status; } +// Returns true when |queue_ordinal| is the physical device's counter range +// sampling queue. +// +// Queue affinity ANY resolves to queue 0 for ordinary submissions, so using +// the final queue gives the sampler the best chance to run independently while +// the default queue is saturated. When only one queue exists we fall back to +// that queue and sampling is necessarily ordered behind user work. +static bool iree_hal_amdgpu_logical_device_is_profile_counter_range_queue( + const iree_hal_amdgpu_physical_device_t* physical_device, + iree_host_size_t queue_ordinal) { + return queue_ordinal + 1 == physical_device->host_queue_count; +} + +static iree_hal_amdgpu_host_queue_t* +iree_hal_amdgpu_logical_device_select_profile_counter_range_queue( + iree_hal_amdgpu_physical_device_t* physical_device) { + if (physical_device->host_queue_count == 0) return NULL; + return &physical_device->host_queues[physical_device->host_queue_count - 1]; +} + static iree_status_t iree_hal_amdgpu_logical_device_set_counter_profiling_enabled( iree_hal_amdgpu_logical_device_t* logical_device, @@ -898,7 +918,9 @@ if (capture_dispatch_samples) { flags |= IREE_HAL_AMDGPU_PROFILE_COUNTER_ENABLE_FLAG_DISPATCH_SAMPLES; } - if (j == 0 && capture_queue_ranges) { + if (capture_queue_ranges && + iree_hal_amdgpu_logical_device_is_profile_counter_range_queue( + physical_device, j)) { flags |= IREE_HAL_AMDGPU_PROFILE_COUNTER_ENABLE_FLAG_QUEUE_RANGES; } status = iree_hal_amdgpu_host_queue_enable_profile_counters( @@ -947,8 +969,10 @@ "logical device physical device has no host " "queues (initialization incomplete)"); } else { - status = iree_hal_amdgpu_host_queue_start_profile_counter_ranges( - &physical_device->host_queues[0]); + iree_hal_amdgpu_host_queue_t* queue = + iree_hal_amdgpu_logical_device_select_profile_counter_range_queue( + physical_device); + status = iree_hal_amdgpu_host_queue_start_profile_counter_ranges(queue); if (iree_status_is_ok(status)) { 
++started_device_count; } @@ -959,10 +983,12 @@ for (iree_host_size_t i = 0; i < started_device_count; ++i) { iree_hal_amdgpu_physical_device_t* physical_device = logical_device->physical_devices[i]; + iree_hal_amdgpu_host_queue_t* queue = + iree_hal_amdgpu_logical_device_select_profile_counter_range_queue( + physical_device); status = iree_status_join( status, iree_hal_amdgpu_host_queue_flush_profile_counter_ranges( - &physical_device->host_queues[0], /*sink=*/NULL, - /*session_id=*/0, + queue, /*sink=*/NULL, /*session_id=*/0, IREE_HAL_AMDGPU_PROFILE_COUNTER_RANGE_FLUSH_FLAG_NONE)); } } @@ -994,8 +1020,11 @@ "logical device physical device has no host " "queues (initialization incomplete)"); } else { + iree_hal_amdgpu_host_queue_t* queue = + iree_hal_amdgpu_logical_device_select_profile_counter_range_queue( + physical_device); status = iree_hal_amdgpu_host_queue_flush_profile_counter_ranges( - &physical_device->host_queues[0], sink, session_id, flags); + queue, sink, session_id, flags); } }