// Copyright 2025 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/hal/drivers/amdgpu/logical_device.h"
#include "iree/async/frontier.h"
#include "iree/async/frontier_tracker.h"
#include "iree/async/util/proactor_pool.h"
#include "iree/hal/drivers/amdgpu/abi/signal.h"
#include "iree/hal/drivers/amdgpu/allocator.h"
#include "iree/hal/drivers/amdgpu/api.h"
#include "iree/hal/drivers/amdgpu/aql_command_buffer.h"
#include "iree/hal/drivers/amdgpu/aql_program_builder.h"
#include "iree/hal/drivers/amdgpu/executable.h"
#include "iree/hal/drivers/amdgpu/executable_cache.h"
#include "iree/hal/drivers/amdgpu/host_queue_profile.h"
#include "iree/hal/drivers/amdgpu/host_queue_profile_events.h"
#include "iree/hal/drivers/amdgpu/physical_device.h"
#include "iree/hal/drivers/amdgpu/profile_counters.h"
#include "iree/hal/drivers/amdgpu/profile_device_metrics.h"
#include "iree/hal/drivers/amdgpu/profile_traces.h"
#include "iree/hal/drivers/amdgpu/queue_affinity.h"
#include "iree/hal/drivers/amdgpu/semaphore.h"
#include "iree/hal/drivers/amdgpu/system.h"
#include "iree/hal/drivers/amdgpu/util/epoch_signal_table.h"
#include "iree/hal/drivers/amdgpu/util/kfd.h"
#include "iree/hal/drivers/amdgpu/util/notification_ring.h"
#include "iree/hal/drivers/amdgpu/util/topology.h"
#include "iree/hal/drivers/amdgpu/util/vmem.h"
#include "iree/hal/utils/file_registry.h"
//===----------------------------------------------------------------------===//
// Utilities
//===----------------------------------------------------------------------===//
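// Builds the queue affinity domain for this logical device: the supported
// queue mask plus the physical device count and per-device queue count used by
// the affinity resolution helpers below.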
static iree_hal_amdgpu_queue_affinity_domain_t
iree_hal_amdgpu_logical_device_queue_affinity_domain(
const iree_hal_amdgpu_logical_device_t* logical_device) {
return (iree_hal_amdgpu_queue_affinity_domain_t){
.supported_affinity = logical_device->queue_affinity_mask,
.physical_device_count = logical_device->physical_device_count,
.queue_count_per_physical_device =
logical_device->system->topology.gpu_agent_queue_count,
};
}
// Returns the queue for a flattened logical queue ordinal.
static iree_status_t iree_hal_amdgpu_logical_device_queue_from_ordinal(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_host_size_t queue_ordinal,
iree_hal_amdgpu_virtual_queue_t** out_queue) {
IREE_ASSERT_ARGUMENT(logical_device);
IREE_ASSERT_ARGUMENT(out_queue);
*out_queue = NULL;
iree_hal_amdgpu_queue_affinity_resolved_t resolved;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_queue_affinity_resolve_ordinal(
iree_hal_amdgpu_logical_device_queue_affinity_domain(logical_device),
queue_ordinal, &resolved));
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[resolved.physical_device_ordinal];
if (IREE_UNLIKELY(resolved.physical_queue_ordinal >=
physical_device->host_queue_count)) {
return iree_make_status(IREE_STATUS_INTERNAL,
"queue affinity ordinal %" PRIhsz
" maps to invalid host queue ordinal "
"%" PRIhsz " on physical device %" PRIhsz,
queue_ordinal, resolved.physical_queue_ordinal,
resolved.physical_device_ordinal);
}
*out_queue =
&physical_device->host_queues[resolved.physical_queue_ordinal].base;
return iree_ok_status();
}
//===----------------------------------------------------------------------===//
// iree_hal_amdgpu_logical_device_options_t
//===----------------------------------------------------------------------===//
// Power-of-two size for the shared host small block pool in bytes.
// Used for small host-side transients/wrappers of device-side resources.
#define IREE_HAL_AMDGPU_LOGICAL_DEVICE_DEFAULT_SMALL_HOST_BLOCK_SIZE (8 * 1024)
// Minimum size of a small host block (some structures require at least this
// much memory).
#define IREE_HAL_AMDGPU_LOGICAL_DEVICE_MIN_SMALL_HOST_BLOCK_SIZE (4 * 1024)
// Power-of-two size for the shared host large block pool in bytes.
// Used for resource tracking and other larger host-side transients.
#define IREE_HAL_AMDGPU_LOGICAL_DEVICE_DEFAULT_LARGE_HOST_BLOCK_SIZE (64 * 1024)
// Minimum size of a large host block (some structures require at least this
// much memory).
#define IREE_HAL_AMDGPU_LOGICAL_DEVICE_MIN_LARGE_HOST_BLOCK_SIZE (64 * 1024)
IREE_API_EXPORT void iree_hal_amdgpu_logical_device_options_initialize(
iree_hal_amdgpu_logical_device_options_t* out_options) {
IREE_ASSERT_ARGUMENT(out_options);
memset(out_options, 0, sizeof(*out_options));
// TODO(benvanik): set defaults based on compiler configuration. Flags should
// not be used as multiple devices may be configured within the process, or
// the hosting application may be authored in Python/etc with no flags
// mechanism accessible here.
out_options->host_block_pools.small.block_size =
IREE_HAL_AMDGPU_LOGICAL_DEVICE_DEFAULT_SMALL_HOST_BLOCK_SIZE;
out_options->host_block_pools.large.block_size =
IREE_HAL_AMDGPU_LOGICAL_DEVICE_DEFAULT_LARGE_HOST_BLOCK_SIZE;
out_options->host_block_pools.command_buffer.usable_block_size =
IREE_HAL_AMDGPU_AQL_PROGRAM_DEFAULT_BLOCK_SIZE;
out_options->device_block_pools.small.block_size =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_SMALL_DEVICE_BLOCK_SIZE_DEFAULT;
out_options->device_block_pools.small.initial_capacity =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_SMALL_DEVICE_BLOCK_INITIAL_CAPACITY_DEFAULT;
out_options->device_block_pools.large.block_size =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_LARGE_DEVICE_BLOCK_SIZE_DEFAULT;
out_options->device_block_pools.large.initial_capacity =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_LARGE_DEVICE_BLOCK_INITIAL_CAPACITY_DEFAULT;
out_options->default_pool.range_length =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_POOL_RANGE_LENGTH_DEFAULT;
out_options->default_pool.alignment =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_POOL_ALIGNMENT_DEFAULT;
out_options->default_pool.frontier_capacity =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_POOL_FRONTIER_CAPACITY_DEFAULT;
out_options->queue_placement = IREE_HAL_AMDGPU_QUEUE_PLACEMENT_ANY;
out_options->host_queues.aql_capacity =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_HOST_QUEUE_AQL_CAPACITY;
out_options->host_queues.notification_capacity =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_HOST_QUEUE_NOTIFICATION_CAPACITY;
out_options->host_queues.kernarg_capacity =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_HOST_QUEUE_KERNARG_CAPACITY;
out_options->preallocate_pools = 1;
}
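// Parses |params| key/value pairs into |options|. No parameters are currently
// recognized and any that are provided will be rejected.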
IREE_API_EXPORT iree_status_t iree_hal_amdgpu_logical_device_options_parse(
iree_hal_amdgpu_logical_device_options_t* options,
iree_string_pair_list_t params) {
IREE_ASSERT_ARGUMENT(options);
if (!params.count) return iree_ok_status();
IREE_TRACE_ZONE_BEGIN(z0);
const iree_string_pair_t* first_param = &params.pairs[0];
iree_status_t status = iree_make_status(
IREE_STATUS_INVALID_ARGUMENT,
"AMDGPU logical device options do not support key/value parameter '%.*s'",
(int)first_param->key.size, first_param->key.data);
IREE_TRACE_ZONE_END(z0);
return status;
}
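// Verifies that the requested options are supported by the current
// implementation: recognized-but-unavailable features fail with UNIMPLEMENTED
// and malformed values fail with INVALID_ARGUMENT/OUT_OF_RANGE.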
iree_status_t iree_hal_amdgpu_logical_device_options_verify_supported_features(
const iree_hal_amdgpu_logical_device_options_t* options) {
IREE_ASSERT_ARGUMENT(options);
switch (options->queue_placement) {
case IREE_HAL_AMDGPU_QUEUE_PLACEMENT_ANY:
case IREE_HAL_AMDGPU_QUEUE_PLACEMENT_HOST:
break;
case IREE_HAL_AMDGPU_QUEUE_PLACEMENT_DEVICE:
return iree_make_status(
IREE_STATUS_UNIMPLEMENTED,
"AMDGPU device queue placement is not implemented; use "
"queue_placement=any or queue_placement=host");
default:
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"invalid AMDGPU queue placement value %u",
(uint32_t)options->queue_placement);
}
if (options->exclusive_execution) {
return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
"AMDGPU exclusive_execution is not implemented");
}
if (options->wait_active_for_ns < 0) {
return iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"AMDGPU wait_active_for_ns must be non-negative (got %" PRId64 ")",
options->wait_active_for_ns);
}
if (options->wait_active_for_ns != 0) {
return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
"AMDGPU wait_active_for_ns is not implemented; "
"use 0");
}
return iree_ok_status();
}
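// Verifies option values against implementation limits and |topology|:
// block pool sizes, the queue-axis encoding limit, the total queue count, and
// host queue capacities.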
static iree_status_t iree_hal_amdgpu_logical_device_options_verify(
const iree_hal_amdgpu_logical_device_options_t* options,
const iree_hal_amdgpu_libhsa_t* libhsa,
const iree_hal_amdgpu_topology_t* topology) {
IREE_ASSERT_ARGUMENT(options);
IREE_ASSERT_ARGUMENT(topology);
IREE_TRACE_ZONE_BEGIN(z0);
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_amdgpu_logical_device_options_verify_supported_features(
options));
if (options->host_block_pools.small.block_size <
IREE_HAL_AMDGPU_LOGICAL_DEVICE_MIN_SMALL_HOST_BLOCK_SIZE ||
!iree_host_size_is_power_of_two(
options->host_block_pools.small.block_size)) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"small host block pool size invalid, expected a "
"power-of-two greater than %d and got %" PRIhsz,
IREE_HAL_AMDGPU_LOGICAL_DEVICE_MIN_SMALL_HOST_BLOCK_SIZE,
options->host_block_pools.small.block_size));
}
if (options->host_block_pools.large.block_size <
IREE_HAL_AMDGPU_LOGICAL_DEVICE_MIN_LARGE_HOST_BLOCK_SIZE ||
!iree_host_size_is_power_of_two(
options->host_block_pools.large.block_size)) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"large host block pool size invalid, expected a "
"power-of-two greater than %d and got %" PRIhsz,
IREE_HAL_AMDGPU_LOGICAL_DEVICE_MIN_LARGE_HOST_BLOCK_SIZE,
options->host_block_pools.large.block_size));
}
if (options->host_block_pools.command_buffer.usable_block_size <
IREE_HAL_AMDGPU_AQL_PROGRAM_MIN_BLOCK_SIZE ||
options->host_block_pools.command_buffer.usable_block_size > UINT32_MAX ||
!iree_host_size_is_power_of_two(
options->host_block_pools.command_buffer.usable_block_size)) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"command-buffer host block pool usable size invalid, expected "
"a power-of-two between %u and %u and got %" PRIhsz,
IREE_HAL_AMDGPU_AQL_PROGRAM_MIN_BLOCK_SIZE, UINT32_MAX,
options->host_block_pools.command_buffer.usable_block_size));
}
if (topology->gpu_agent_queue_count > UINT8_MAX) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_make_status(IREE_STATUS_OUT_OF_RANGE,
"gpu_agent_queue_count=%" PRIhsz
" exceeds the queue-axis encoding limit (%u)",
topology->gpu_agent_queue_count, UINT8_MAX));
}
iree_host_size_t total_queue_count = 0;
if (!iree_host_size_checked_mul(topology->gpu_agent_count,
topology->gpu_agent_queue_count,
&total_queue_count) ||
total_queue_count > IREE_HAL_MAX_QUEUES) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0,
iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"topology queue space does not fit in iree_hal_queue_affinity_t "
"(gpu_agent_count=%" PRIhsz ", gpu_agent_queue_count=%" PRIhsz
", max_total_queues=%" PRIhsz ")",
topology->gpu_agent_count, topology->gpu_agent_queue_count,
(iree_host_size_t)IREE_HAL_MAX_QUEUES));
}
if (!iree_host_size_is_power_of_two(options->host_queues.aql_capacity) ||
!iree_host_size_is_power_of_two(
options->host_queues.notification_capacity) ||
!iree_host_size_is_power_of_two(options->host_queues.kernarg_capacity)) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"host queue AQL, notification, and kernarg capacities must all "
"be powers of two (got aql=%u, notification=%u, "
"kernarg_blocks=%u)",
options->host_queues.aql_capacity,
options->host_queues.notification_capacity,
options->host_queues.kernarg_capacity));
}
if (options->host_queues.kernarg_capacity / 2u <
options->host_queues.aql_capacity) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"host queue kernarg capacity must be at least 2x the AQL queue "
"capacity (got kernarg_blocks=%u, aql_packets=%u)",
options->host_queues.kernarg_capacity,
options->host_queues.aql_capacity));
}
IREE_TRACE_ZONE_END(z0);
return iree_ok_status();
}
//===----------------------------------------------------------------------===//
// iree_hal_amdgpu_logical_device_t
//===----------------------------------------------------------------------===//
static const iree_hal_device_vtable_t iree_hal_amdgpu_logical_device_vtable;
static iree_hal_amdgpu_logical_device_t* iree_hal_amdgpu_logical_device_cast(
iree_hal_device_t* base_value) {
IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_amdgpu_logical_device_vtable);
return (iree_hal_amdgpu_logical_device_t*)base_value;
}
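// Returns true if any family in |data_families| requires HSA timestamping to
// be enabled on the underlying hardware queues.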
static bool iree_hal_amdgpu_logical_device_profiling_needs_hsa_timestamps(
iree_hal_device_profiling_data_families_t data_families) {
return iree_any_bit_set(data_families,
IREE_HAL_DEVICE_PROFILING_DATA_DEVICE_QUEUE_EVENTS |
IREE_HAL_DEVICE_PROFILING_DATA_DISPATCH_EVENTS |
IREE_HAL_DEVICE_PROFILING_DATA_COUNTER_SAMPLES |
IREE_HAL_DEVICE_PROFILING_DATA_EXECUTABLE_TRACES);
}
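// Data families implied by a lightweight-statistics profiling request.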
static iree_hal_device_profiling_data_families_t
iree_hal_amdgpu_logical_device_lightweight_statistics_data_families(void) {
return IREE_HAL_DEVICE_PROFILING_DATA_EXECUTABLE_METADATA |
IREE_HAL_DEVICE_PROFILING_DATA_DEVICE_QUEUE_EVENTS |
IREE_HAL_DEVICE_PROFILING_DATA_DISPATCH_EVENTS;
}
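// Resolves user-provided profiling options: a lightweight-statistics request
// with no explicit data families is expanded to the implied families and the
// flag is stripped so downstream code only sees explicit families.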
static iree_hal_device_profiling_options_t
iree_hal_amdgpu_logical_device_resolve_profiling_options(
const iree_hal_device_profiling_options_t* options) {
iree_hal_device_profiling_options_t resolved_options = *options;
if (resolved_options.data_families == IREE_HAL_DEVICE_PROFILING_DATA_NONE &&
iree_hal_device_profiling_options_requests_lightweight_statistics(
options)) {
resolved_options.data_families =
iree_hal_amdgpu_logical_device_lightweight_statistics_data_families();
}
resolved_options.flags &=
~IREE_HAL_DEVICE_PROFILING_FLAG_LIGHTWEIGHT_STATISTICS;
return resolved_options;
}
// Power-of-two capacity for logical-device memory lifecycle event buffering.
#define IREE_HAL_AMDGPU_LOGICAL_DEVICE_PROFILE_MEMORY_EVENT_CAPACITY (64 * 1024)
// Power-of-two capacity for logical-device queue operation event buffering.
#define IREE_HAL_AMDGPU_LOGICAL_DEVICE_PROFILE_QUEUE_EVENT_CAPACITY (64 * 1024)
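// Returns SESSION chunk metadata tagged with the logical device identifier and
// |session_id|.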
static iree_hal_profile_chunk_metadata_t
iree_hal_amdgpu_logical_device_profile_session_metadata(
iree_hal_amdgpu_logical_device_t* logical_device, uint64_t session_id) {
iree_hal_profile_chunk_metadata_t metadata =
iree_hal_profile_chunk_metadata_default();
metadata.content_type = IREE_HAL_PROFILE_CONTENT_TYPE_SESSION;
metadata.name = logical_device->identifier;
metadata.session_id = session_id;
return metadata;
}
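// Packs a (physical device ordinal, queue ordinal) pair into a 64-bit profile
// stream ID with the device ordinal in the high 32 bits and the queue ordinal
// in the low 32 bits, e.g. (device=1, queue=2) -> 0x0000000100000002.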
static uint64_t iree_hal_amdgpu_logical_device_profile_queue_stream_id(
uint32_t physical_device_ordinal, uint32_t queue_ordinal) {
return ((uint64_t)physical_device_ordinal << 32) | (uint64_t)queue_ordinal;
}
static bool iree_hal_amdgpu_logical_device_profile_memory_events_requested(
const iree_hal_amdgpu_logical_device_t* logical_device) {
return iree_hal_device_profiling_options_requests_data(
&logical_device->profiling.options,
IREE_HAL_DEVICE_PROFILING_DATA_MEMORY_EVENTS) &&
logical_device->profiling.options.sink &&
iree_hal_amdgpu_profile_event_streams_has_memory_storage(
&logical_device->profiling.event_streams);
}
bool iree_hal_amdgpu_logical_device_should_record_profile_memory_events(
iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
return iree_hal_amdgpu_logical_device_profile_memory_events_requested(
logical_device);
}
static void iree_hal_amdgpu_logical_device_reset_profile_options(
iree_hal_amdgpu_logical_device_t* logical_device) {
iree_hal_device_profiling_options_storage_free(
logical_device->profiling.options_storage,
logical_device->host_allocator);
logical_device->profiling.options_storage = NULL;
logical_device->profiling.options = (iree_hal_device_profiling_options_t){0};
}
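// Returns true if the dispatch at the given location should be captured under
// the active profiling options: at least one dispatch-level data family must
// be requested, the capture filter location must match, and (when enabled) the
// executable export name pattern must match.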
bool iree_hal_amdgpu_logical_device_should_profile_dispatch(
iree_hal_amdgpu_logical_device_t* logical_device, uint64_t executable_id,
uint32_t export_ordinal, uint64_t command_buffer_id, uint32_t command_index,
uint32_t physical_device_ordinal, uint32_t queue_ordinal) {
if (!iree_any_bit_set(logical_device->profiling.options.data_families,
IREE_HAL_DEVICE_PROFILING_DATA_DISPATCH_EVENTS |
IREE_HAL_DEVICE_PROFILING_DATA_COUNTER_SAMPLES |
IREE_HAL_DEVICE_PROFILING_DATA_EXECUTABLE_TRACES)) {
return false;
}
const iree_hal_profile_capture_filter_t* filter =
&logical_device->profiling.options.capture_filter;
if (!iree_hal_profile_capture_filter_matches_location(
filter, command_buffer_id, command_index, physical_device_ordinal,
queue_ordinal)) {
return false;
}
if (iree_any_bit_set(
filter->flags,
IREE_HAL_PROFILE_CAPTURE_FILTER_FLAG_EXECUTABLE_EXPORT_PATTERN)) {
return iree_hal_amdgpu_profile_metadata_export_matches(
&logical_device->profile_metadata, executable_id, export_ordinal,
filter->executable_export_pattern);
}
return true;
}
uint64_t iree_hal_amdgpu_logical_device_allocate_profile_memory_allocation_id(
iree_hal_device_t* base_device, uint64_t* out_session_id) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
*out_session_id = 0;
if (!iree_hal_amdgpu_logical_device_profile_memory_events_requested(
logical_device)) {
return 0;
}
return iree_hal_amdgpu_profile_event_streams_allocate_memory_allocation_id(
&logical_device->profiling.event_streams,
logical_device->profiling.session_id, out_session_id);
}
bool iree_hal_amdgpu_logical_device_record_profile_memory_event_for_session(
iree_hal_device_t* base_device, uint64_t session_id,
const iree_hal_profile_memory_event_t* event) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
if (!iree_hal_amdgpu_logical_device_profile_memory_events_requested(
logical_device)) {
return false;
}
return iree_hal_amdgpu_profile_event_streams_record_memory_event(
&logical_device->profiling.event_streams,
logical_device->profiling.session_id, session_id, event);
}
bool iree_hal_amdgpu_logical_device_record_profile_memory_event(
iree_hal_device_t* base_device,
const iree_hal_profile_memory_event_t* event) {
return iree_hal_amdgpu_logical_device_record_profile_memory_event_for_session(
base_device, /*session_id=*/0, event);
}
static bool iree_hal_amdgpu_logical_device_profile_queue_events_requested(
const iree_hal_amdgpu_logical_device_t* logical_device) {
return iree_hal_device_profiling_options_requests_data(
&logical_device->profiling.options,
IREE_HAL_DEVICE_PROFILING_DATA_QUEUE_EVENTS) &&
logical_device->profiling.options.sink &&
iree_hal_amdgpu_profile_event_streams_has_queue_storage(
&logical_device->profiling.event_streams);
}
void iree_hal_amdgpu_logical_device_record_profile_queue_event(
iree_hal_device_t* base_device,
const iree_hal_profile_queue_event_t* event) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
if (!iree_hal_amdgpu_logical_device_profile_queue_events_requested(
logical_device)) {
return;
}
iree_hal_amdgpu_profile_event_streams_record_queue_event(
&logical_device->profiling.event_streams, event);
}
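// Samples one host/device clock correlation record for |physical_device| by
// querying the KFD clock counters and bracketing the query with host
// timestamps.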
static iree_status_t
iree_hal_amdgpu_logical_device_sample_profile_clock_correlation(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_amdgpu_physical_device_t* physical_device,
iree_hal_profile_clock_correlation_record_t* out_record) {
if (IREE_UNLIKELY(physical_device->device_ordinal > UINT32_MAX)) {
return iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"profile clock correlation physical device ordinal out of range: "
"%" PRIhsz,
physical_device->device_ordinal);
}
iree_hal_amdgpu_clock_counters_t counters = {0};
const iree_time_t host_time_begin_ns = iree_time_now();
iree_status_t status = iree_hal_amdgpu_kfd_get_clock_counters(
logical_device->system->kfd_fd, physical_device->kfd_gpu_uid, &counters);
const iree_time_t host_time_end_ns = iree_time_now();
if (iree_status_is_ok(status)) {
*out_record = iree_hal_profile_clock_correlation_record_default();
out_record->flags =
IREE_HAL_PROFILE_CLOCK_CORRELATION_FLAG_DEVICE_TICK |
IREE_HAL_PROFILE_CLOCK_CORRELATION_FLAG_HOST_CPU_TIMESTAMP |
IREE_HAL_PROFILE_CLOCK_CORRELATION_FLAG_HOST_SYSTEM_TIMESTAMP |
IREE_HAL_PROFILE_CLOCK_CORRELATION_FLAG_HOST_TIME_BRACKET;
out_record->physical_device_ordinal =
(uint32_t)physical_device->device_ordinal;
out_record->sample_id =
logical_device->profiling.next_clock_correlation_sample_id++;
out_record->device_tick = counters.gpu_clock_counter;
out_record->host_cpu_timestamp_ns = counters.cpu_clock_counter;
out_record->host_system_timestamp = counters.system_clock_counter;
out_record->host_system_frequency_hz = counters.system_clock_freq;
out_record->host_time_begin_ns = host_time_begin_ns;
out_record->host_time_end_ns = host_time_end_ns;
} else {
status = iree_status_annotate_f(
status,
"sampling profile clock correlation for physical_device_ordinal=%zu "
"gpu_uid=%" PRIu32,
physical_device->device_ordinal, physical_device->kfd_gpu_uid);
}
return status;
}
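// Writes one DEVICES metadata chunk to |sink| enumerating every physical
// device in the logical device (ordinal, queue count, and UUID if available).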
static iree_status_t iree_hal_amdgpu_logical_device_write_profile_devices(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_profile_sink_t* sink, uint64_t session_id) {
IREE_TRACE_ZONE_BEGIN(z0);
const iree_host_size_t record_count = logical_device->physical_device_count;
if (record_count == 0) {
IREE_TRACE_ZONE_END(z0);
return iree_make_status(
IREE_STATUS_INTERNAL,
"logical device has no physical devices (initialization incomplete)");
}
iree_host_size_t records_size = 0;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, IREE_STRUCT_LAYOUT(
0, &records_size,
IREE_STRUCT_FIELD(record_count, iree_hal_profile_device_record_t,
NULL)));
iree_hal_profile_device_record_t* records = NULL;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_allocator_malloc(logical_device->host_allocator, records_size,
(void**)&records));
iree_status_t status = iree_ok_status();
for (iree_host_size_t i = 0; i < record_count && iree_status_is_ok(status);
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
if (IREE_UNLIKELY(physical_device->device_ordinal > UINT32_MAX ||
physical_device->host_queue_count > UINT32_MAX)) {
status = iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"profile device metadata ordinals out of range: device=%" PRIhsz
", queue_count=%" PRIhsz,
physical_device->device_ordinal, physical_device->host_queue_count);
break;
}
records[i] = iree_hal_profile_device_record_default();
records[i].physical_device_ordinal =
(uint32_t)physical_device->device_ordinal;
records[i].queue_count = (uint32_t)physical_device->host_queue_count;
if (physical_device->has_physical_device_uuid) {
records[i].flags |= IREE_HAL_PROFILE_DEVICE_FLAG_PHYSICAL_DEVICE_UUID;
memcpy(records[i].physical_device_uuid,
physical_device->physical_device_uuid,
sizeof(records[i].physical_device_uuid));
}
}
if (iree_status_is_ok(status)) {
iree_hal_profile_chunk_metadata_t metadata =
iree_hal_profile_chunk_metadata_default();
metadata.content_type = IREE_HAL_PROFILE_CONTENT_TYPE_DEVICES;
metadata.name = logical_device->identifier;
metadata.session_id = session_id;
iree_const_byte_span_t iovec =
iree_make_const_byte_span(records, records_size);
status = iree_hal_profile_sink_write(sink, &metadata, 1, &iovec);
}
iree_allocator_free(logical_device->host_allocator, records);
IREE_TRACE_ZONE_END(z0);
return status;
}
static iree_status_t iree_hal_amdgpu_logical_device_write_profile_queues(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_profile_sink_t* sink, uint64_t session_id) {
IREE_TRACE_ZONE_BEGIN(z0);
iree_host_size_t record_count = 0;
for (iree_host_size_t i = 0; i < logical_device->physical_device_count; ++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
if (IREE_UNLIKELY(!iree_host_size_checked_add(
record_count, physical_device->host_queue_count, &record_count))) {
IREE_TRACE_ZONE_END(z0);
return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
"profile queue metadata count overflow");
}
}
if (record_count == 0) {
IREE_TRACE_ZONE_END(z0);
return iree_make_status(
IREE_STATUS_INTERNAL,
"logical device has no host queues (initialization incomplete)");
}
iree_host_size_t records_size = 0;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, IREE_STRUCT_LAYOUT(
0, &records_size,
IREE_STRUCT_FIELD(record_count, iree_hal_profile_queue_record_t,
NULL)));
iree_hal_profile_queue_record_t* records = NULL;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_allocator_malloc(logical_device->host_allocator, records_size,
(void**)&records));
iree_status_t status = iree_ok_status();
iree_host_size_t record_ordinal = 0;
for (iree_host_size_t i = 0;
i < logical_device->physical_device_count && iree_status_is_ok(status);
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
if (IREE_UNLIKELY(physical_device->device_ordinal > UINT32_MAX)) {
status = iree_make_status(IREE_STATUS_OUT_OF_RANGE,
"profile queue metadata physical device "
"ordinal out of range: %" PRIhsz,
physical_device->device_ordinal);
break;
}
const uint32_t physical_device_ordinal =
(uint32_t)physical_device->device_ordinal;
for (iree_host_size_t j = 0;
j < physical_device->host_queue_count && iree_status_is_ok(status);
++j) {
if (IREE_UNLIKELY(j > UINT32_MAX)) {
status = iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"profile queue metadata queue ordinal out of range: %" PRIhsz, j);
break;
}
const uint32_t queue_ordinal = (uint32_t)j;
records[record_ordinal] = iree_hal_profile_queue_record_default();
records[record_ordinal].physical_device_ordinal = physical_device_ordinal;
records[record_ordinal].queue_ordinal = queue_ordinal;
records[record_ordinal].stream_id =
iree_hal_amdgpu_logical_device_profile_queue_stream_id(
physical_device_ordinal, queue_ordinal);
++record_ordinal;
}
}
if (iree_status_is_ok(status)) {
iree_hal_profile_chunk_metadata_t metadata =
iree_hal_profile_chunk_metadata_default();
metadata.content_type = IREE_HAL_PROFILE_CONTENT_TYPE_QUEUES;
metadata.name = logical_device->identifier;
metadata.session_id = session_id;
iree_const_byte_span_t iovec =
iree_make_const_byte_span(records, records_size);
status = iree_hal_profile_sink_write(sink, &metadata, 1, &iovec);
}
iree_allocator_free(logical_device->host_allocator, records);
IREE_TRACE_ZONE_END(z0);
return status;
}
static iree_status_t
iree_hal_amdgpu_logical_device_write_profile_clock_correlations(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_profile_sink_t* sink, uint64_t session_id) {
IREE_TRACE_ZONE_BEGIN(z0);
const iree_host_size_t record_count = logical_device->physical_device_count;
if (record_count == 0) {
IREE_TRACE_ZONE_END(z0);
return iree_make_status(
IREE_STATUS_INTERNAL,
"logical device has no physical devices (initialization incomplete)");
}
iree_host_size_t records_size = 0;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, IREE_STRUCT_LAYOUT(
0, &records_size,
IREE_STRUCT_FIELD(record_count,
iree_hal_profile_clock_correlation_record_t,
NULL)));
iree_hal_profile_clock_correlation_record_t* records = NULL;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_allocator_malloc(logical_device->host_allocator, records_size,
(void**)&records));
iree_status_t status = iree_ok_status();
for (iree_host_size_t i = 0; i < record_count && iree_status_is_ok(status);
++i) {
status = iree_hal_amdgpu_logical_device_sample_profile_clock_correlation(
logical_device, logical_device->physical_devices[i], &records[i]);
}
if (iree_status_is_ok(status)) {
iree_hal_profile_chunk_metadata_t metadata =
iree_hal_profile_chunk_metadata_default();
metadata.content_type = IREE_HAL_PROFILE_CONTENT_TYPE_CLOCK_CORRELATIONS;
metadata.name = logical_device->identifier;
metadata.session_id = session_id;
iree_const_byte_span_t iovec =
iree_make_const_byte_span(records, records_size);
status = iree_hal_profile_sink_write(sink, &metadata, 1, &iovec);
}
iree_allocator_free(logical_device->host_allocator, records);
IREE_TRACE_ZONE_END(z0);
return status;
}
static bool iree_hal_amdgpu_logical_device_profile_needs_executable_artifacts(
iree_hal_device_profiling_data_families_t data_families) {
return iree_any_bit_set(data_families,
IREE_HAL_DEVICE_PROFILING_DATA_EXECUTABLE_METADATA |
IREE_HAL_DEVICE_PROFILING_DATA_EXECUTABLE_TRACES);
}
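// Writes all profile metadata chunks for |session_id| to |sink|: devices,
// queues, executable metadata (with artifacts when requested), and clock
// correlations.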
static iree_status_t iree_hal_amdgpu_logical_device_write_profile_metadata(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_profile_sink_t* sink, uint64_t session_id,
iree_hal_device_profiling_data_families_t data_families) {
const bool emit_executable_artifacts =
iree_hal_amdgpu_logical_device_profile_needs_executable_artifacts(
data_families);
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_write_profile_devices(
logical_device, sink, session_id));
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_write_profile_queues(
logical_device, sink, session_id));
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_profile_metadata_write(
&logical_device->profile_metadata, sink, session_id,
logical_device->identifier, emit_executable_artifacts,
&logical_device->profiling.metadata_cursor));
return iree_hal_amdgpu_logical_device_write_profile_clock_correlations(
logical_device, sink, session_id);
}
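// Flushes buffered queue and memory event streams followed by per-host-queue
// profile events for |session_id| to |sink|.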
static iree_status_t iree_hal_amdgpu_logical_device_write_profile_events(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_profile_sink_t* sink, uint64_t session_id) {
IREE_TRACE_ZONE_BEGIN(z0);
iree_status_t status = iree_hal_amdgpu_profile_event_streams_write_queue(
&logical_device->profiling.event_streams, sink, session_id,
logical_device->host_allocator);
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_profile_event_streams_write_memory(
&logical_device->profiling.event_streams, sink, session_id,
logical_device->host_allocator);
}
for (iree_host_size_t i = 0;
i < logical_device->physical_device_count && iree_status_is_ok(status);
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
for (iree_host_size_t j = 0;
j < physical_device->host_queue_count && iree_status_is_ok(status);
++j) {
status = iree_hal_amdgpu_host_queue_write_profile_events(
&physical_device->host_queues[j], sink, session_id);
}
}
IREE_TRACE_ZONE_END(z0);
return status;
}
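// Derives the per-queue profile flags implied by the requested profiling data
// families.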
static iree_hal_amdgpu_host_queue_profile_flags_t
iree_hal_amdgpu_logical_device_queue_profile_flags(
const iree_hal_device_profiling_options_t* options) {
iree_hal_amdgpu_host_queue_profile_flags_t flags =
IREE_HAL_AMDGPU_HOST_QUEUE_PROFILE_FLAG_NONE;
if (iree_hal_device_profiling_options_requests_data(
options, IREE_HAL_DEVICE_PROFILING_DATA_QUEUE_EVENTS)) {
flags |= IREE_HAL_AMDGPU_HOST_QUEUE_PROFILE_FLAG_QUEUE_EVENTS;
}
if (iree_hal_device_profiling_options_requests_data(
options, IREE_HAL_DEVICE_PROFILING_DATA_DEVICE_QUEUE_EVENTS)) {
flags |= IREE_HAL_AMDGPU_HOST_QUEUE_PROFILE_FLAG_QUEUE_DEVICE_EVENTS;
}
if (iree_any_bit_set(options->data_families,
IREE_HAL_DEVICE_PROFILING_DATA_DISPATCH_EVENTS |
IREE_HAL_DEVICE_PROFILING_DATA_COUNTER_SAMPLES |
IREE_HAL_DEVICE_PROFILING_DATA_EXECUTABLE_TRACES)) {
flags |= IREE_HAL_AMDGPU_HOST_QUEUE_PROFILE_FLAG_DISPATCHES;
}
return flags;
}
static void iree_hal_amdgpu_logical_device_set_queue_profiling_enabled(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_amdgpu_host_queue_profile_flags_t flags) {
for (iree_host_size_t i = 0; i < logical_device->physical_device_count; ++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
for (iree_host_size_t j = 0; j < physical_device->host_queue_count; ++j) {
iree_hal_amdgpu_host_queue_set_profile_flags(
&physical_device->host_queues[j], flags);
}
}
}
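// Toggles HSA timestamp profiling on all physical devices. If an enable fails
// partway through, the devices that were already enabled are rolled back; if a
// disable fails partway through, disable is still attempted on the remaining
// devices so none is left profiling.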
static iree_status_t iree_hal_amdgpu_logical_device_set_hsa_profiling_enabled(
iree_hal_amdgpu_logical_device_t* logical_device, bool enabled) {
IREE_TRACE_ZONE_BEGIN(z0);
IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, enabled ? 1 : 0);
iree_status_t status = iree_ok_status();
iree_host_size_t changed_count = 0;
for (iree_host_size_t i = 0;
i < logical_device->physical_device_count && iree_status_is_ok(status);
++i) {
status = iree_hal_amdgpu_physical_device_set_hsa_profiling_enabled(
logical_device->physical_devices[i], enabled);
if (iree_status_is_ok(status)) {
++changed_count;
}
}
if (!iree_status_is_ok(status) && enabled) {
for (iree_host_size_t i = 0; i < changed_count; ++i) {
status = iree_status_join(
status, iree_hal_amdgpu_physical_device_set_hsa_profiling_enabled(
logical_device->physical_devices[i], false));
}
} else if (!enabled) {
for (iree_host_size_t i = changed_count;
i < logical_device->physical_device_count; ++i) {
status = iree_status_join(
status, iree_hal_amdgpu_physical_device_set_hsa_profiling_enabled(
logical_device->physical_devices[i], false));
}
}
IREE_TRACE_ZONE_END(z0);
return status;
}
static iree_status_t
iree_hal_amdgpu_logical_device_set_counter_profiling_enabled(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_amdgpu_profile_counter_session_t* counter_session, bool enabled) {
if (!iree_hal_amdgpu_profile_counter_session_is_active(counter_session)) {
return iree_ok_status();
}
IREE_TRACE_ZONE_BEGIN(z0);
IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, enabled ? 1 : 0);
iree_status_t status = iree_ok_status();
iree_host_size_t changed_queue_count = 0;
for (iree_host_size_t i = 0;
i < logical_device->physical_device_count && iree_status_is_ok(status);
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
for (iree_host_size_t j = 0;
j < physical_device->host_queue_count && iree_status_is_ok(status);
++j) {
iree_hal_amdgpu_host_queue_t* queue = &physical_device->host_queues[j];
if (enabled) {
status = iree_hal_amdgpu_host_queue_enable_profile_counters(
queue, counter_session);
if (iree_status_is_ok(status)) {
++changed_queue_count;
}
} else {
iree_hal_amdgpu_host_queue_disable_profile_counters(queue);
}
}
}
if (!iree_status_is_ok(status) && enabled) {
for (iree_host_size_t i = 0, seen_queue_count = 0;
i < logical_device->physical_device_count &&
seen_queue_count < changed_queue_count;
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
for (iree_host_size_t j = 0; j < physical_device->host_queue_count &&
seen_queue_count < changed_queue_count;
++j, ++seen_queue_count) {
iree_hal_amdgpu_host_queue_disable_profile_counters(
&physical_device->host_queues[j]);
}
}
}
IREE_TRACE_ZONE_END(z0);
return status;
}
static iree_status_t iree_hal_amdgpu_logical_device_set_trace_profiling_enabled(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_amdgpu_profile_trace_session_t* trace_session, bool enabled) {
if (!iree_hal_amdgpu_profile_trace_session_is_active(trace_session)) {
return iree_ok_status();
}
IREE_TRACE_ZONE_BEGIN(z0);
IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, enabled ? 1 : 0);
iree_status_t status = iree_ok_status();
iree_host_size_t changed_queue_count = 0;
for (iree_host_size_t i = 0;
i < logical_device->physical_device_count && iree_status_is_ok(status);
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
for (iree_host_size_t j = 0;
j < physical_device->host_queue_count && iree_status_is_ok(status);
++j) {
iree_hal_amdgpu_host_queue_t* queue = &physical_device->host_queues[j];
if (enabled) {
status = iree_hal_amdgpu_host_queue_enable_profile_traces(
queue, trace_session);
if (iree_status_is_ok(status)) {
++changed_queue_count;
}
} else {
iree_hal_amdgpu_host_queue_disable_profile_traces(queue);
}
}
}
if (!iree_status_is_ok(status) && enabled) {
for (iree_host_size_t i = 0, seen_queue_count = 0;
i < logical_device->physical_device_count &&
seen_queue_count < changed_queue_count;
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
for (iree_host_size_t j = 0; j < physical_device->host_queue_count &&
seen_queue_count < changed_queue_count;
++j, ++seen_queue_count) {
iree_hal_amdgpu_host_queue_disable_profile_traces(
&physical_device->host_queues[j]);
}
}
}
IREE_TRACE_ZONE_END(z0);
return status;
}
// Selects one host queue from |queue_affinity| after intersecting with this
// logical device's supported queues. The current policy is deterministic
// first-set-bit selection, which is enough to honor explicit HIP stream
// affinities and keep the CTS path stable. A multi-bit affinity therefore acts
// as "any of these queues"; queue_flush handles multi-bit masks by iterating
// all selected queues instead.
static iree_status_t iree_hal_amdgpu_logical_device_select_host_queue(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_queue_affinity_t queue_affinity,
iree_hal_amdgpu_virtual_queue_t** out_queue) {
IREE_ASSERT_ARGUMENT(logical_device);
IREE_ASSERT_ARGUMENT(out_queue);
*out_queue = NULL;
iree_hal_amdgpu_queue_affinity_resolved_t resolved;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_queue_affinity_resolve(
iree_hal_amdgpu_logical_device_queue_affinity_domain(logical_device),
queue_affinity, &resolved));
return iree_hal_amdgpu_logical_device_queue_from_ordinal(
logical_device, resolved.queue_ordinal, out_queue);
}
// Selects the physical device backing |queue_affinity| for pool creation.
//
// Queue pools are scoped to one physical memory domain, but |queue_affinity|
// still has the usual "any queue in this mask" meaning. This helper therefore
// collapses multi-bit masks with the same deterministic first-set-bit policy as
// host queue submission. In practice IREE_HAL_QUEUE_AFFINITY_ANY usually
// selects queue 0 after intersecting with this device's supported queue mask.
static iree_status_t
iree_hal_amdgpu_logical_device_select_queue_pool_physical_device(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_queue_affinity_t queue_affinity,
iree_hal_amdgpu_physical_device_t** out_physical_device) {
IREE_ASSERT_ARGUMENT(logical_device);
IREE_ASSERT_ARGUMENT(out_physical_device);
*out_physical_device = NULL;
iree_hal_amdgpu_queue_affinity_resolved_t resolved;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_queue_affinity_resolve(
iree_hal_amdgpu_logical_device_queue_affinity_domain(logical_device),
queue_affinity, &resolved));
*out_physical_device =
logical_device->physical_devices[resolved.physical_device_ordinal];
return iree_ok_status();
}
// Normalizes command-buffer queue affinity to queues on one physical device and
// returns the physical device ordinal whose executable kernel objects may be
// baked into the recorded command stream.
static iree_status_t
iree_hal_amdgpu_logical_device_normalize_command_buffer_affinity(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_queue_affinity_t queue_affinity,
iree_hal_queue_affinity_t* out_queue_affinity,
iree_host_size_t* out_device_ordinal) {
*out_queue_affinity = 0;
*out_device_ordinal = 0;
return iree_hal_amdgpu_queue_affinity_normalize_for_physical_device(
iree_hal_amdgpu_logical_device_queue_affinity_domain(logical_device),
queue_affinity, out_queue_affinity, out_device_ordinal);
}
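// Frontier tracker query callback. The per-axis epoch signal counts down from
// IREE_HAL_AMDGPU_EPOCH_INITIAL_VALUE, so the completed epoch is
// IREE_HAL_AMDGPU_EPOCH_INITIAL_VALUE - current signal value; returns true
// once that has reached |epoch|.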
static bool iree_hal_amdgpu_logical_device_query_pool_epoch(
void* user_data, iree_async_axis_t axis, uint64_t epoch) {
iree_hal_amdgpu_logical_device_t* logical_device =
(iree_hal_amdgpu_logical_device_t*)user_data;
hsa_signal_t epoch_signal = {0};
if (!iree_hal_amdgpu_epoch_signal_table_lookup(
logical_device->host_queue_epoch_table, axis, &epoch_signal)) {
return false;
}
iree_amd_signal_t* signal =
(iree_amd_signal_t*)(uintptr_t)epoch_signal.handle;
const iree_hsa_signal_value_t current_value = iree_atomic_load(
(iree_atomic_int64_t*)&signal->value, iree_memory_order_acquire);
if (IREE_UNLIKELY(current_value < 0 ||
current_value > IREE_HAL_AMDGPU_EPOCH_INITIAL_VALUE)) {
return false;
}
const uint64_t current_epoch =
(uint64_t)IREE_HAL_AMDGPU_EPOCH_INITIAL_VALUE - (uint64_t)current_value;
return current_epoch >= epoch;
}
static void iree_hal_amdgpu_logical_device_deassign_frontier(
iree_hal_amdgpu_logical_device_t* logical_device) {
IREE_TRACE_ZONE_BEGIN(z0);
for (iree_host_size_t i = 0; i < logical_device->physical_device_count; ++i) {
iree_hal_amdgpu_physical_device_deassign_frontier(
logical_device->physical_devices[i]);
}
iree_async_frontier_tracker_release(logical_device->frontier_tracker);
logical_device->frontier_tracker = NULL;
logical_device->axis = 0;
memset(&logical_device->topology_info, 0,
sizeof(logical_device->topology_info));
if (logical_device->host_queue_epoch_table) {
iree_allocator_free(logical_device->host_allocator,
logical_device->host_queue_epoch_table);
logical_device->host_queue_epoch_table = NULL;
}
IREE_TRACE_ZONE_END(z0);
}
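// Asynchronous failure handler: records the first failure as the device
// sticky status and frees any subsequent failures.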
static void iree_hal_amdgpu_logical_device_error_handler(void* user_data,
iree_status_t status) {
iree_hal_amdgpu_logical_device_t* logical_device =
(iree_hal_amdgpu_logical_device_t*)user_data;
IREE_TRACE_ZONE_BEGIN(z0);
// Display the error in trace tooling.
IREE_TRACE({
char buffer[1024];
iree_host_size_t buffer_length = 0;
if (iree_status_format(status, sizeof(buffer), buffer, &buffer_length)) {
IREE_TRACE_MESSAGE_DYNAMIC(ERROR, buffer, buffer_length);
}
});
// Set the device sticky error status (if it is not already set).
intptr_t current_value = 0;
if (!iree_atomic_compare_exchange_strong(
&logical_device->failure_status, &current_value, (intptr_t)status,
iree_memory_order_acq_rel, iree_memory_order_relaxed)) {
// Previous status was not OK; the sticky slot owns only the first failure.
iree_status_free(status);
}
IREE_TRACE_ZONE_END(z0);
}
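// Translates logical device options into the physical device options shared
// by every physical device within the logical device.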
static void iree_hal_amdgpu_logical_device_translate_physical_options(
const iree_hal_amdgpu_logical_device_options_t* options,
const iree_hal_amdgpu_topology_t* topology,
iree_hal_amdgpu_physical_device_options_t* out_options) {
iree_hal_amdgpu_physical_device_options_initialize(out_options);
out_options->device_block_pools.small.block_size =
options->device_block_pools.small.block_size;
out_options->device_block_pools.small.initial_capacity =
options->device_block_pools.small.initial_capacity;
out_options->device_block_pools.large.block_size =
options->device_block_pools.large.block_size;
out_options->device_block_pools.large.initial_capacity =
options->device_block_pools.large.initial_capacity;
out_options->default_pool.range_length = options->default_pool.range_length;
out_options->default_pool.alignment = options->default_pool.alignment;
out_options->default_pool.frontier_capacity =
options->default_pool.frontier_capacity;
out_options->host_block_pool_initial_capacity =
options->preallocate_pools ? 16 : 0;
out_options->host_queue_count = topology->gpu_agent_queue_count;
out_options->host_queue_aql_capacity = options->host_queues.aql_capacity;
out_options->host_queue_notification_capacity =
options->host_queues.notification_capacity;
out_options->host_queue_kernarg_capacity =
options->host_queues.kernarg_capacity;
out_options->force_wait_barrier_defer = options->force_wait_barrier_defer;
}
static iree_status_t iree_hal_amdgpu_logical_device_verify_physical_options(
const iree_hal_amdgpu_physical_device_options_t* options,
const iree_hal_amdgpu_libhsa_t* libhsa,
const iree_hal_amdgpu_topology_t* topology) {
for (iree_host_size_t i = 0; i < topology->gpu_agent_count; ++i) {
hsa_agent_t gpu_agent = topology->gpu_agents[i];
hsa_agent_t cpu_agent = topology->cpu_agents[topology->gpu_cpu_map[i]];
IREE_RETURN_IF_ERROR(
iree_hal_amdgpu_physical_device_options_verify(options, libhsa,
cpu_agent, gpu_agent),
"verifying GPU agent %" PRIhsz " meets required options", i);
}
return iree_ok_status();
}
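// Allocates the logical device and all of its trailing storage (physical
// device pointer table, per-device data blocks, and identifier string) as a
// single host allocation, initializes base fields, and points the pointer
// table at the trailing data blocks.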
static iree_status_t iree_hal_amdgpu_logical_device_allocate_storage(
iree_string_view_t identifier, const iree_hal_amdgpu_topology_t* topology,
iree_host_size_t physical_device_size, iree_allocator_t host_allocator,
iree_hal_amdgpu_logical_device_t** out_logical_device) {
*out_logical_device = NULL;
iree_hal_amdgpu_logical_device_t* logical_device = NULL;
iree_host_size_t physical_device_data_offset = 0;
iree_host_size_t identifier_offset = 0;
iree_host_size_t total_size = 0;
IREE_RETURN_IF_ERROR(IREE_STRUCT_LAYOUT(
sizeof(*logical_device), &total_size,
IREE_STRUCT_FIELD(topology->gpu_agent_count,
iree_hal_amdgpu_physical_device_t*, NULL),
IREE_STRUCT_ARRAY_FIELD_ALIGNED(
topology->gpu_agent_count, physical_device_size, uint8_t,
iree_max_align_t, &physical_device_data_offset),
IREE_STRUCT_FIELD(identifier.size, char, &identifier_offset)));
const iree_hal_amdgpu_queue_affinity_domain_t queue_affinity_domain = {
.supported_affinity = IREE_HAL_QUEUE_AFFINITY_ANY,
.physical_device_count = topology->gpu_agent_count,
.queue_count_per_physical_device = topology->gpu_agent_queue_count,
};
iree_hal_queue_affinity_t logical_queue_affinity_mask = 0;
for (iree_host_size_t i = 0; i < topology->gpu_agent_count; ++i) {
iree_hal_queue_affinity_t physical_device_affinity = 0;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_queue_affinity_for_physical_device(
queue_affinity_domain, i, &physical_device_affinity));
iree_hal_queue_affinity_or_into(logical_queue_affinity_mask,
physical_device_affinity);
}
IREE_RETURN_IF_ERROR(iree_allocator_malloc(host_allocator, total_size,
(void**)&logical_device));
memset(logical_device, 0, total_size);
iree_hal_resource_initialize(&iree_hal_amdgpu_logical_device_vtable,
&logical_device->resource);
iree_string_view_append_to_buffer(identifier, &logical_device->identifier,
(char*)logical_device + identifier_offset);
logical_device->host_allocator = host_allocator;
logical_device->failure_status = IREE_ATOMIC_VAR_INIT(0);
iree_atomic_store(&logical_device->epoch, 0, iree_memory_order_relaxed);
logical_device->next_profile_session_id = 1;
iree_hal_amdgpu_profile_metadata_initialize(
host_allocator, &logical_device->profile_metadata);
iree_hal_amdgpu_profile_event_streams_initialize(
&logical_device->profiling.event_streams);
// Set up the physical device table first so failure cleanup has a valid table.
logical_device->physical_device_count = topology->gpu_agent_count;
logical_device->queue_affinity_mask = logical_queue_affinity_mask;
uint8_t* physical_device_base =
(uint8_t*)logical_device + physical_device_data_offset;
for (iree_host_size_t i = 0; i < logical_device->physical_device_count; ++i) {
logical_device->physical_devices[i] =
(iree_hal_amdgpu_physical_device_t*)physical_device_base;
physical_device_base += physical_device_size;
}
*out_logical_device = logical_device;
return iree_ok_status();
}
static iree_status_t iree_hal_amdgpu_logical_device_initialize_host_resources(
iree_hal_amdgpu_logical_device_t* logical_device,
const iree_hal_amdgpu_logical_device_options_t* options,
iree_async_proactor_pool_t* proactor_pool,
iree_allocator_t host_allocator) {
logical_device->proactor_pool = proactor_pool;
iree_async_proactor_pool_retain(logical_device->proactor_pool);
iree_arena_block_pool_initialize(options->host_block_pools.small.block_size,
host_allocator,
&logical_device->host_block_pools.small);
iree_arena_block_pool_initialize(options->host_block_pools.large.block_size,
host_allocator,
&logical_device->host_block_pools.large);
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_aql_program_block_pool_initialize(
options->host_block_pools.command_buffer.usable_block_size,
host_allocator, &logical_device->host_block_pools.command_buffer));
return iree_async_proactor_pool_get(logical_device->proactor_pool, 0,
&logical_device->proactor);
}
static iree_status_t
iree_hal_amdgpu_logical_device_initialize_system_and_allocator(
iree_hal_amdgpu_logical_device_t* logical_device,
const iree_hal_amdgpu_logical_device_options_t* options,
const iree_hal_amdgpu_libhsa_t* libhsa,
const iree_hal_amdgpu_topology_t* topology,
iree_allocator_t host_allocator) {
iree_hal_amdgpu_system_options_t system_options = {
.exclusive_execution = options->exclusive_execution,
};
IREE_RETURN_IF_ERROR(
iree_hal_amdgpu_system_allocate(libhsa, topology, system_options,
host_allocator, &logical_device->system));
return iree_hal_amdgpu_allocator_create(
logical_device, &logical_device->system->libhsa,
&logical_device->system->topology, host_allocator,
&logical_device->device_allocator);
}
static iree_status_t iree_hal_amdgpu_logical_device_initialize_physical_devices(
iree_hal_amdgpu_logical_device_t* logical_device,
const iree_hal_amdgpu_topology_t* topology,
const iree_hal_amdgpu_physical_device_options_t* options,
iree_allocator_t host_allocator) {
for (iree_host_size_t device_ordinal = 0;
device_ordinal < logical_device->physical_device_count;
++device_ordinal) {
const iree_host_size_t host_ordinal = topology->gpu_cpu_map[device_ordinal];
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_physical_device_initialize(
(iree_hal_device_t*)logical_device, logical_device->system, options,
logical_device->proactor, host_ordinal,
&logical_device->system->host_memory_pools[host_ordinal],
device_ordinal, host_allocator,
logical_device->physical_devices[device_ordinal]));
}
return iree_ok_status();
}
static iree_status_t iree_hal_amdgpu_logical_device_warmup_host_pools(
iree_hal_amdgpu_logical_device_t* logical_device) {
IREE_RETURN_IF_ERROR(iree_arena_block_pool_preallocate(
&logical_device->host_block_pools.small, 16));
IREE_RETURN_IF_ERROR(iree_arena_block_pool_preallocate(
&logical_device->host_block_pools.large, 16));
return iree_arena_block_pool_preallocate(
&logical_device->host_block_pools.command_buffer, 16);
}
iree_status_t iree_hal_amdgpu_logical_device_create(
iree_string_view_t identifier,
const iree_hal_amdgpu_logical_device_options_t* options,
const iree_hal_amdgpu_libhsa_t* libhsa,
const iree_hal_amdgpu_topology_t* topology,
const iree_hal_device_create_params_t* create_params,
iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
IREE_ASSERT_ARGUMENT(options);
IREE_ASSERT_ARGUMENT(create_params);
IREE_ASSERT_ARGUMENT(create_params->proactor_pool);
IREE_ASSERT_ARGUMENT(out_device);
IREE_TRACE_ZONE_BEGIN(z0);
*out_device = NULL;
// Verify the topology is valid for a logical device.
// This may have already been performed by the caller but doing it here
// ensures all code paths must verify prior to creating a device.
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_amdgpu_topology_verify(topology, libhsa),
"verifying topology");
// Verify the parameters prior to creating resources.
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0,
iree_hal_amdgpu_logical_device_options_verify(options, libhsa, topology),
"verifying logical device options");
iree_hal_amdgpu_physical_device_options_t physical_device_options = {0};
iree_hal_amdgpu_logical_device_translate_physical_options(
options, topology, &physical_device_options);
// Verify all GPU agents meet the required physical device options. Each
// embedded physical device has the same layout because all physical devices
// in one logical device share the same host-queue options.
const iree_host_size_t physical_device_size =
iree_hal_amdgpu_physical_device_calculate_size(&physical_device_options);
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0,
iree_hal_amdgpu_logical_device_verify_physical_options(
&physical_device_options, libhsa, topology),
"verifying physical device options");
// Allocate the logical device and all nested physical device data structures.
iree_hal_amdgpu_logical_device_t* logical_device = NULL;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_amdgpu_logical_device_allocate_storage(
identifier, topology, physical_device_size, host_allocator,
&logical_device));
iree_status_t status =
iree_hal_amdgpu_logical_device_initialize_host_resources(
logical_device, options, create_params->proactor_pool,
host_allocator);
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_logical_device_initialize_system_and_allocator(
logical_device, options, libhsa, topology, host_allocator);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_logical_device_initialize_physical_devices(
logical_device, topology, &physical_device_options, host_allocator);
}
// If requested, warm up the pools we expect to grow on first use of the
// backend. The first use may still need more than is preallocated here but
// that's ok - users can warm up further if they want.
if (iree_status_is_ok(status) && options->preallocate_pools) {
status = iree_hal_amdgpu_logical_device_warmup_host_pools(logical_device);
}
if (iree_status_is_ok(status)) {
*out_device = (iree_hal_device_t*)logical_device;
} else {
iree_hal_device_release((iree_hal_device_t*)logical_device);
}
IREE_TRACE_ZONE_END(z0);
return status;
}
static void iree_hal_amdgpu_logical_device_destroy(
iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_allocator_t host_allocator = iree_hal_device_host_allocator(base_device);
IREE_TRACE_ZONE_BEGIN(z0);
iree_hal_amdgpu_profile_counter_session_t* counter_session =
logical_device->profiling.counter_session;
iree_hal_amdgpu_profile_trace_session_t* trace_session =
logical_device->profiling.trace_session;
if (trace_session) {
for (iree_host_size_t i = 0; i < logical_device->physical_device_count;
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
for (iree_host_size_t j = 0; j < physical_device->host_queue_count; ++j) {
iree_hal_amdgpu_host_queue_disable_profile_traces(
&physical_device->host_queues[j]);
}
}
logical_device->profiling.trace_session = NULL;
iree_hal_amdgpu_profile_trace_session_free(trace_session);
}
if (counter_session) {
for (iree_host_size_t i = 0; i < logical_device->physical_device_count;
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
for (iree_host_size_t j = 0; j < physical_device->host_queue_count; ++j) {
iree_hal_amdgpu_host_queue_disable_profile_counters(
&physical_device->host_queues[j]);
}
}
logical_device->profiling.counter_session = NULL;
iree_hal_amdgpu_profile_counter_session_free(counter_session);
}
iree_hal_amdgpu_logical_device_reset_profile_options(logical_device);
logical_device->profiling.session_id = 0;
iree_hal_amdgpu_profile_event_streams_deinitialize(
&logical_device->profiling.event_streams, logical_device->host_allocator);
iree_hal_amdgpu_logical_device_deassign_frontier(logical_device);
// Devices may hold allocations and need to be cleaned up first.
for (iree_host_size_t i = 0; i < logical_device->physical_device_count; ++i) {
iree_hal_amdgpu_physical_device_deinitialize(
logical_device->physical_devices[i]);
}
iree_hal_allocator_release(logical_device->device_allocator);
iree_hal_channel_provider_release(logical_device->channel_provider);
// This may unload HSA; must come after all resources are released.
iree_hal_amdgpu_system_free(logical_device->system);
iree_hal_amdgpu_profile_metadata_deinitialize(
&logical_device->profile_metadata);
// Note that these may be used by other child data types and must be freed
// last.
iree_arena_block_pool_deinitialize(&logical_device->host_block_pools.small);
iree_arena_block_pool_deinitialize(&logical_device->host_block_pools.large);
iree_arena_block_pool_deinitialize(
&logical_device->host_block_pools.command_buffer);
iree_async_proactor_pool_release(logical_device->proactor_pool);
iree_allocator_free(host_allocator, logical_device);
IREE_TRACE_ZONE_END(z0);
}
static iree_string_view_t iree_hal_amdgpu_logical_device_id(
iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
return logical_device->identifier;
}
static iree_allocator_t iree_hal_amdgpu_logical_device_host_allocator(
iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
return logical_device->host_allocator;
}
static iree_hal_allocator_t* iree_hal_amdgpu_logical_device_allocator(
iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
return logical_device->device_allocator;
}
static void iree_hal_amdgpu_replace_device_allocator(
iree_hal_device_t* base_device, iree_hal_allocator_t* new_allocator) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_allocator_retain(new_allocator);
iree_hal_allocator_release(logical_device->device_allocator);
logical_device->device_allocator = new_allocator;
}
static void iree_hal_amdgpu_replace_channel_provider(
iree_hal_device_t* base_device, iree_hal_channel_provider_t* new_provider) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_channel_provider_retain(new_provider);
iree_hal_channel_provider_release(logical_device->channel_provider);
logical_device->channel_provider = new_provider;
}
static iree_status_t iree_hal_amdgpu_logical_device_trim(
iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
// Release pooled resources from each physical device. These may return items
// back to the parent logical device pools.
for (iree_host_size_t i = 0; i < logical_device->physical_device_count; ++i) {
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_physical_device_trim(
logical_device->physical_devices[i]));
}
// Trim the allocator pools, if any.
IREE_RETURN_IF_ERROR(
iree_hal_allocator_trim(logical_device->device_allocator));
// Trim host pools.
iree_arena_block_pool_trim(&logical_device->host_block_pools.small);
iree_arena_block_pool_trim(&logical_device->host_block_pools.large);
iree_arena_block_pool_trim(&logical_device->host_block_pools.command_buffer);
return iree_ok_status();
}
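// Handles device metadata queries: hal.device.id pattern matching,
// hal.executable.format support checks, and device/dispatch concurrency
// hints.
//
// Illustrative usage through the generic HAL API (not part of this file):
//   int64_t concurrency = 0;
//   IREE_RETURN_IF_ERROR(iree_hal_device_query_i64(
//       device, IREE_SV("hal.dispatch"), IREE_SV("concurrency"),
//       &concurrency));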
static iree_status_t iree_hal_amdgpu_logical_device_query_i64(
iree_hal_device_t* base_device, iree_string_view_t category,
iree_string_view_t key, int64_t* out_value) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
*out_value = 0;
if (iree_string_view_equal(category, IREE_SV("hal.device.id"))) {
// NOTE: this is a fuzzy match and can allow a program to work with multiple
// device implementations.
*out_value =
iree_string_view_match_pattern(logical_device->identifier, key) ? 1 : 0;
return iree_ok_status();
}
iree_hal_amdgpu_system_t* system = logical_device->system;
if (iree_string_view_equal(category, IREE_SV("hal.executable.format"))) {
bool is_supported = false;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_executable_format_supported(
&system->libhsa, system->topology.gpu_agents[0], key, &is_supported,
/*out_isa=*/NULL));
*out_value = is_supported ? 1 : 0;
return iree_ok_status();
}
if (iree_string_view_equal(category, IREE_SV("hal.device"))) {
if (iree_string_view_equal(key, IREE_SV("concurrency"))) {
*out_value = system->topology.gpu_agent_count *
system->topology.gpu_agent_queue_count;
return iree_ok_status();
}
} else if (iree_string_view_equal(category, IREE_SV("hal.dispatch"))) {
if (iree_string_view_equal(key, IREE_SV("concurrency"))) {
uint32_t compute_unit_count = 0;
IREE_RETURN_IF_ERROR(iree_hsa_agent_get_info(
IREE_LIBHSA(&system->libhsa), system->topology.gpu_agents[0],
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
&compute_unit_count));
*out_value = compute_unit_count;
return iree_ok_status();
}
}
return iree_make_status(
IREE_STATUS_NOT_FOUND,
"unknown device configuration key value '%.*s :: %.*s'",
(int)category.size, category.data, (int)key.size, key.data);
}
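// Populates the generic HAL capability flags and identity for the logical
// device. For composite (multi-GPU) logical devices only properties that hold
// for every physical device are reported; the rest are left at conservative
// defaults.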
static iree_status_t iree_hal_amdgpu_logical_device_query_capabilities(
iree_hal_device_t* base_device,
iree_hal_device_capabilities_t* out_capabilities) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
memset(out_capabilities, 0, sizeof(*out_capabilities));
if (logical_device->physical_device_count == 0) {
return iree_make_status(
IREE_STATUS_INTERNAL,
"logical device has no physical devices (initialization incomplete)");
}
// A multi-GPU logical device is a composite HAL device. Generic HAL topology
// has only one node for it, so do not expose a physical-device-0 identity as
// though it represented the entire composite. Exact internal physical device
// identity is reported through AMDGPU profile/device metadata and queue
// affinity records.
const bool is_composite_device = logical_device->physical_device_count > 1;
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[0];
memset(out_capabilities->physical_device_uuid, 0,
sizeof(out_capabilities->physical_device_uuid));
if (!is_composite_device && physical_device->has_physical_device_uuid) {
memcpy(out_capabilities->physical_device_uuid,
physical_device->physical_device_uuid,
sizeof(out_capabilities->physical_device_uuid));
out_capabilities->has_physical_device_uuid = true;
}
// Report a NUMA affinity only when the composite has a single nearest host
// node that fits the generic HAL uint8_t representation. Mixed-NUMA
// composites intentionally leave the default 0 because generic topology
// cannot express one logical device spanning multiple CPU NUMA nodes.
uint32_t host_numa_node = physical_device->host_numa_node;
bool has_representative_numa_node = host_numa_node <= UINT8_MAX;
for (iree_host_size_t i = 1; i < logical_device->physical_device_count &&
has_representative_numa_node;
++i) {
has_representative_numa_node =
logical_device->physical_devices[i]->host_numa_node == host_numa_node;
}
if (has_representative_numa_node) {
out_capabilities->numa_node = (uint8_t)host_numa_node;
}
// External handle types (DMA-BUF support from system info).
if (logical_device->system->info.dmabuf_supported) {
out_capabilities->buffer_export_types |=
IREE_HAL_TOPOLOGY_HANDLE_TYPE_DMA_BUF;
out_capabilities->buffer_import_types |=
IREE_HAL_TOPOLOGY_HANDLE_TYPE_DMA_BUF;
}
// Capability flags.
if (logical_device->system->info.svm_accessible_by_default) {
out_capabilities->flags |= IREE_HAL_DEVICE_CAPABILITY_UNIFIED_MEMORY;
}
// AMDGPU semaphores are native async timeline semaphores (not binary
// emulation).
out_capabilities->flags |= IREE_HAL_DEVICE_CAPABILITY_TIMELINE_SEMAPHORES;
// Fine-grained memory provides host coherency without explicit flushes.
// Coarse-grained memory requires fences, but the driver manages that
// transparently.
out_capabilities->flags |= IREE_HAL_DEVICE_CAPABILITY_HOST_COHERENT;
// All AMDGPU devices support device-scope atomics. System-scope atomics are
// supported on fine-grained memory when callers explicitly opt into
// host-visible placement.
out_capabilities->flags |= IREE_HAL_DEVICE_CAPABILITY_ATOMIC_SCOPE_DEVICE;
out_capabilities->flags |= IREE_HAL_DEVICE_CAPABILITY_ATOMIC_SCOPE_SYSTEM;
// All AMD GPUs support peer-to-peer DMA (through XGMI or PCIe). The actual
// access mode for a specific GPU pair is determined by
// refine_topology_edge — here we declare the capability in principle.
out_capabilities->flags |= IREE_HAL_DEVICE_CAPABILITY_P2P_COPY;
// Peer addressability depends on whether SVM is enabled (large BAR / XGMI
// provides load/store access to peer memory without explicit grants).
if (logical_device->system->info.svm_accessible_by_default) {
out_capabilities->flags |= IREE_HAL_DEVICE_CAPABILITY_PEER_ADDRESSABLE;
// SVM implies peer coherency on fine-grained memory.
out_capabilities->flags |= IREE_HAL_DEVICE_CAPABILITY_PEER_COHERENT;
}
// Driver handle (HSA agent handle for same-driver refinement). Composite
// devices intentionally leave this unset: a single HSA agent handle would
// make generic topology alias detection treat a composite as one GPU.
if (!is_composite_device) {
out_capabilities->driver_device_handle =
(uintptr_t)physical_device->device_agent.handle;
}
return iree_ok_status();
}
static const iree_hal_device_topology_info_t*
iree_hal_amdgpu_logical_device_topology_info(iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
return &logical_device->topology_info;
}
// Maximum number of HSA memory-pool link hops we will stack-allocate.
#define IREE_HAL_AMDGPU_MAX_TOPOLOGY_LINK_HOPS 16
typedef struct iree_hal_amdgpu_physical_topology_edge_t {
// Source-agent access to the destination coarse-grained memory pool.
hsa_amd_memory_pool_access_t coarse_access;
// Source-agent access to the destination fine-grained memory pool.
hsa_amd_memory_pool_access_t fine_access;
// True when |coarse_access| permits some direct device access.
bool coarse_accessible;
// True when |fine_access| permits some direct device access.
bool fine_accessible;
// True when every HSA-reported link hop supports coherent transactions.
bool all_hops_coherent;
// True when every HSA-reported link hop supports 32-bit atomics.
bool all_hops_atomic_32bit;
// True when every HSA-reported link hop supports 64-bit atomics.
bool all_hops_atomic_64bit;
// Worst physical link class across the reported HSA link hops.
iree_hal_topology_link_class_t link_class;
// Conservative copy-cost class derived from |link_class|.
uint8_t copy_cost;
// Conservative latency class derived from |link_class|.
uint8_t latency_class;
// Worst normalized NUMA distance reported by HSA link hops.
uint8_t numa_distance;
} iree_hal_amdgpu_physical_topology_edge_t;
typedef struct iree_hal_amdgpu_topology_edge_aggregate_t {
// Conservatively intersected capabilities valid for every physical pair.
iree_hal_topology_capability_t physical_capabilities;
// Worst non-coherent read mode across all physical pairs.
iree_hal_topology_interop_mode_t noncoherent_read_mode;
// Worst non-coherent write mode across all physical pairs.
iree_hal_topology_interop_mode_t noncoherent_write_mode;
// Worst coherent read mode across all physical pairs.
iree_hal_topology_interop_mode_t coherent_read_mode;
// Worst coherent write mode across all physical pairs.
iree_hal_topology_interop_mode_t coherent_write_mode;
// Worst link class across all physical pairs.
iree_hal_topology_link_class_t link_class;
// Worst copy-cost class across all physical pairs.
uint8_t copy_cost;
// Worst latency class across all physical pairs.
uint8_t latency_class;
// Worst normalized NUMA distance across all physical pairs.
uint8_t numa_distance;
} iree_hal_amdgpu_topology_edge_aggregate_t;
// Maps an HSA link type to a HAL topology link class.
// For multi-hop links, the caller should take the worst (highest) class.
static iree_hal_topology_link_class_t iree_hal_amdgpu_link_type_to_link_class(
hsa_amd_link_info_type_t link_type) {
switch (link_type) {
case HSA_AMD_LINK_INFO_TYPE_XGMI:
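      // XGMI is AMD's high-bandwidth GPU interconnect; bucket it with the
      // generic NVLink-class links.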
return IREE_HAL_TOPOLOGY_LINK_CLASS_NVLINK_IF;
case HSA_AMD_LINK_INFO_TYPE_PCIE:
return IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_SAME_ROOT;
case HSA_AMD_LINK_INFO_TYPE_QPI:
case HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT:
// Cross-socket interconnects — treat as cross-root PCIe.
return IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_CROSS_ROOT;
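    // NOTE: INFINBAND (sic) matches the enum spelling in hsa_ext_amd.h.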
case HSA_AMD_LINK_INFO_TYPE_INFINBAND:
return IREE_HAL_TOPOLOGY_LINK_CLASS_FABRIC;
default:
return IREE_HAL_TOPOLOGY_LINK_CLASS_OTHER;
}
}
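// Derives conservative copy-cost and latency classes (0 = cheapest, 15 = most
// expensive) from a link class. The values are a heuristic ordering, not
// measured bandwidths or latencies.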
static void iree_hal_amdgpu_topology_costs_from_link_class(
iree_hal_topology_link_class_t link_class, uint8_t* out_copy_cost,
uint8_t* out_latency_class) {
switch (link_class) {
case IREE_HAL_TOPOLOGY_LINK_CLASS_SAME_DIE:
*out_copy_cost = 0;
*out_latency_class = 0;
break;
case IREE_HAL_TOPOLOGY_LINK_CLASS_NVLINK_IF:
*out_copy_cost = 3;
*out_latency_class = 3;
break;
case IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_SAME_ROOT:
*out_copy_cost = 7;
*out_latency_class = 7;
break;
case IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_CROSS_ROOT:
*out_copy_cost = 9;
*out_latency_class = 9;
break;
case IREE_HAL_TOPOLOGY_LINK_CLASS_HOST_STAGED:
*out_copy_cost = 13;
*out_latency_class = 11;
break;
case IREE_HAL_TOPOLOGY_LINK_CLASS_FABRIC:
*out_copy_cost = 15;
*out_latency_class = 14;
break;
case IREE_HAL_TOPOLOGY_LINK_CLASS_ISOLATED:
*out_copy_cost = 15;
*out_latency_class = 15;
break;
default:
*out_copy_cost = 11;
*out_latency_class = 10;
break;
}
}
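// Scales an HSA-reported NUMA distance into the 4-bit HAL distance class.
// The subtraction of 10 assumes SLIT-style distances where 10 means local;
// e.g. a one-hop distance of 20 maps to (20 - 10) / 2 = 5.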
static uint8_t iree_hal_amdgpu_topology_scale_hsa_numa_distance(
uint32_t hsa_numa_distance) {
if (hsa_numa_distance == 0) return 0;
uint32_t scaled = hsa_numa_distance > 10 ? (hsa_numa_distance - 10) / 2 : 0;
return (uint8_t)iree_min(scaled, 15u);
}
static bool iree_hal_amdgpu_memory_pool_access_is_valid(
hsa_amd_memory_pool_access_t access) {
switch (access) {
case HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED:
case HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT:
case HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT:
return true;
default:
return false;
}
}
static iree_status_t iree_hal_amdgpu_validate_memory_pool_access(
hsa_amd_memory_pool_access_t access, const char* pool_kind) {
if (IREE_LIKELY(iree_hal_amdgpu_memory_pool_access_is_valid(access))) {
return iree_ok_status();
}
return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
"HSA reported unknown %s memory pool access mode %u",
pool_kind, (uint32_t)access);
}
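// Maps a source agent's access mode for a destination memory pool onto a HAL
// interop mode. NEVER_ALLOWED forces staged copies; ALLOWED_BY_DEFAULT permits
// native access; DISALLOWED_BY_DEFAULT (access exists but requires an explicit
// grant) conservatively keeps the caller-provided |base_mode|.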
static iree_hal_topology_interop_mode_t
iree_hal_amdgpu_topology_mode_from_memory_pool_access(
hsa_amd_memory_pool_access_t access,
iree_hal_topology_interop_mode_t base_mode) {
switch (access) {
case HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED:
return IREE_HAL_TOPOLOGY_INTEROP_MODE_COPY;
case HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT:
return IREE_HAL_TOPOLOGY_INTEROP_MODE_NATIVE;
case HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT:
default:
return base_mode;
}
}
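// Derives the HAL capability flags implied by one physical source->destination
// edge. A pair with no direct access to either pool kind reports no peer
// capabilities at all.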
static iree_hal_topology_capability_t
iree_hal_amdgpu_physical_topology_capabilities(
const iree_hal_amdgpu_physical_topology_edge_t* physical_edge) {
iree_hal_topology_capability_t capabilities =
IREE_HAL_TOPOLOGY_CAPABILITY_NONE;
if (!physical_edge->coarse_accessible && !physical_edge->fine_accessible) {
return capabilities;
}
capabilities |= IREE_HAL_TOPOLOGY_CAPABILITY_P2P_COPY;
if (physical_edge->all_hops_coherent) {
capabilities |= IREE_HAL_TOPOLOGY_CAPABILITY_PEER_COHERENT;
}
if (physical_edge->all_hops_atomic_32bit) {
capabilities |= IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_DEVICE;
}
if (physical_edge->all_hops_atomic_64bit) {
capabilities |= IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_SYSTEM;
}
return capabilities;
}
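// Resets |out_physical_edge| to its most optimistic state so that per-hop
// queries can only monotonically worsen it.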
static void iree_hal_amdgpu_physical_topology_edge_initialize(
iree_hal_amdgpu_physical_topology_edge_t* out_physical_edge) {
memset(out_physical_edge, 0, sizeof(*out_physical_edge));
out_physical_edge->coarse_access = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
out_physical_edge->fine_access = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
out_physical_edge->all_hops_coherent = true;
out_physical_edge->all_hops_atomic_32bit = true;
out_physical_edge->all_hops_atomic_64bit = true;
out_physical_edge->link_class = IREE_HAL_TOPOLOGY_LINK_CLASS_SAME_DIE;
}
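// Queries HSA for the interconnect between two physical devices and fills
// |out_physical_edge| with conservative (worst-hop) properties.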
static iree_status_t iree_hal_amdgpu_query_physical_topology_edge(
const iree_hal_amdgpu_libhsa_t* libhsa,
const iree_hal_amdgpu_physical_device_t* source_physical_device,
const iree_hal_amdgpu_physical_device_t* destination_physical_device,
iree_hal_amdgpu_physical_topology_edge_t* out_physical_edge) {
iree_hal_amdgpu_physical_topology_edge_initialize(out_physical_edge);
hsa_agent_t source_agent = source_physical_device->device_agent;
hsa_agent_t destination_agent = destination_physical_device->device_agent;
// Find both memory pool types on the destination agent. Not all devices
// expose both pool types; missing pools are treated as NEVER_ALLOWED for that
// pool kind, but an agent with no global pool at all is not a usable topology
// node.
hsa_amd_memory_pool_t dst_coarse_pool = {0};
bool has_coarse_pool = iree_hal_amdgpu_try_find_coarse_global_memory_pool(
libhsa, destination_agent, &dst_coarse_pool);
hsa_amd_memory_pool_t dst_fine_pool = {0};
bool has_fine_pool = iree_hal_amdgpu_try_find_fine_global_memory_pool(
libhsa, destination_agent, &dst_fine_pool);
if (!has_coarse_pool && !has_fine_pool) {
return iree_make_status(
IREE_STATUS_UNAVAILABLE,
"destination agent has neither coarse nor fine global memory pool");
}
if (has_coarse_pool) {
IREE_RETURN_IF_ERROR(iree_hsa_amd_agent_memory_pool_get_info(
IREE_LIBHSA(libhsa), source_agent, dst_coarse_pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
&out_physical_edge->coarse_access));
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_validate_memory_pool_access(
out_physical_edge->coarse_access, "coarse"));
}
if (has_fine_pool) {
IREE_RETURN_IF_ERROR(iree_hsa_amd_agent_memory_pool_get_info(
IREE_LIBHSA(libhsa), source_agent, dst_fine_pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
&out_physical_edge->fine_access));
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_validate_memory_pool_access(
out_physical_edge->fine_access, "fine"));
}
out_physical_edge->coarse_accessible =
out_physical_edge->coarse_access !=
HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
out_physical_edge->fine_accessible = out_physical_edge->fine_access !=
HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
// Query link hop count and topology. The link topology describes the
// interconnect between agents and is the same regardless of pool granularity;
// use whichever pool is present, preferring coarse-grained memory.
hsa_amd_memory_pool_t link_query_pool =
has_coarse_pool ? dst_coarse_pool : dst_fine_pool;
uint32_t hop_count = 0;
IREE_RETURN_IF_ERROR(iree_hsa_amd_agent_memory_pool_get_info(
IREE_LIBHSA(libhsa), source_agent, link_query_pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS, &hop_count));
if (hop_count > IREE_HAL_AMDGPU_MAX_TOPOLOGY_LINK_HOPS) {
return iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"HSA reports %" PRIu32 " link hops between GPU agents (max %" PRIhsz
")",
hop_count, (iree_host_size_t)IREE_HAL_AMDGPU_MAX_TOPOLOGY_LINK_HOPS);
}
if (hop_count > 0) {
// The LINK_INFO query writes exactly hop_count entries into the caller's
// buffer with no separate size parameter.
hsa_amd_memory_pool_link_info_t
link_info[IREE_HAL_AMDGPU_MAX_TOPOLOGY_LINK_HOPS];
memset(link_info, 0, sizeof(link_info[0]) * hop_count);
IREE_RETURN_IF_ERROR(iree_hsa_amd_agent_memory_pool_get_info(
IREE_LIBHSA(libhsa), source_agent, link_query_pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, link_info));
for (uint32_t i = 0; i < hop_count; ++i) {
iree_hal_topology_link_class_t hop_class =
iree_hal_amdgpu_link_type_to_link_class(link_info[i].link_type);
if (hop_class > out_physical_edge->link_class) {
out_physical_edge->link_class = hop_class;
}
uint8_t numa_distance = iree_hal_amdgpu_topology_scale_hsa_numa_distance(
link_info[i].numa_distance);
if (numa_distance > out_physical_edge->numa_distance) {
out_physical_edge->numa_distance = numa_distance;
}
if (!link_info[i].coherent_support) {
out_physical_edge->all_hops_coherent = false;
}
if (!link_info[i].atomic_support_32bit) {
out_physical_edge->all_hops_atomic_32bit = false;
}
if (!link_info[i].atomic_support_64bit) {
out_physical_edge->all_hops_atomic_64bit = false;
}
}
}
if (!out_physical_edge->coarse_accessible &&
!out_physical_edge->fine_accessible) {
out_physical_edge->link_class = IREE_HAL_TOPOLOGY_LINK_CLASS_HOST_STAGED;
out_physical_edge->all_hops_coherent = false;
out_physical_edge->all_hops_atomic_32bit = false;
out_physical_edge->all_hops_atomic_64bit = false;
}
iree_hal_amdgpu_topology_costs_from_link_class(
out_physical_edge->link_class, &out_physical_edge->copy_cost,
&out_physical_edge->latency_class);
return iree_ok_status();
}
static void iree_hal_amdgpu_topology_edge_aggregate_initialize(
iree_hal_topology_edge_t edge,
iree_hal_amdgpu_topology_edge_aggregate_t* out_aggregate) {
  // Start physical facts at their best values so the aggregate can upgrade an
  // imprecise base edge and then monotonically worsen it as each physical pair
  // is folded in. Per-pair DISALLOWED_BY_DEFAULT access falls back to the base
  // edge mode in iree_hal_amdgpu_topology_edge_aggregate_include.
out_aggregate->physical_capabilities =
IREE_HAL_TOPOLOGY_CAPABILITY_P2P_COPY |
IREE_HAL_TOPOLOGY_CAPABILITY_PEER_COHERENT |
IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_DEVICE |
IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_SYSTEM;
out_aggregate->noncoherent_read_mode = IREE_HAL_TOPOLOGY_INTEROP_MODE_NATIVE;
out_aggregate->noncoherent_write_mode = IREE_HAL_TOPOLOGY_INTEROP_MODE_NATIVE;
out_aggregate->coherent_read_mode = IREE_HAL_TOPOLOGY_INTEROP_MODE_NATIVE;
out_aggregate->coherent_write_mode = IREE_HAL_TOPOLOGY_INTEROP_MODE_NATIVE;
out_aggregate->link_class = IREE_HAL_TOPOLOGY_LINK_CLASS_SAME_DIE;
out_aggregate->copy_cost = 0;
out_aggregate->latency_class = 0;
out_aggregate->numa_distance = iree_hal_topology_edge_numa_distance(edge.lo);
}
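// Folds one physical pair's edge into |aggregate|, monotonically worsening
// interop modes, link class, cost classes, and NUMA distance.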
static void iree_hal_amdgpu_topology_edge_aggregate_include(
iree_hal_topology_edge_t base_edge,
const iree_hal_amdgpu_physical_topology_edge_t* physical_edge,
iree_hal_amdgpu_topology_edge_aggregate_t* aggregate) {
aggregate->physical_capabilities &=
iree_hal_amdgpu_physical_topology_capabilities(physical_edge);
aggregate->noncoherent_read_mode = iree_max(
aggregate->noncoherent_read_mode,
iree_hal_amdgpu_topology_mode_from_memory_pool_access(
physical_edge->coarse_access,
iree_hal_topology_edge_buffer_read_mode_noncoherent(base_edge.lo)));
aggregate->noncoherent_write_mode = iree_max(
aggregate->noncoherent_write_mode,
iree_hal_amdgpu_topology_mode_from_memory_pool_access(
physical_edge->coarse_access,
iree_hal_topology_edge_buffer_write_mode_noncoherent(base_edge.lo)));
aggregate->coherent_read_mode = iree_max(
aggregate->coherent_read_mode,
iree_hal_amdgpu_topology_mode_from_memory_pool_access(
physical_edge->fine_access,
iree_hal_topology_edge_buffer_read_mode_coherent(base_edge.lo)));
aggregate->coherent_write_mode = iree_max(
aggregate->coherent_write_mode,
iree_hal_amdgpu_topology_mode_from_memory_pool_access(
physical_edge->fine_access,
iree_hal_topology_edge_buffer_write_mode_coherent(base_edge.lo)));
if (physical_edge->link_class > aggregate->link_class) {
aggregate->link_class = physical_edge->link_class;
}
if (physical_edge->copy_cost > aggregate->copy_cost) {
aggregate->copy_cost = physical_edge->copy_cost;
}
if (physical_edge->latency_class > aggregate->latency_class) {
aggregate->latency_class = physical_edge->latency_class;
}
if (physical_edge->numa_distance > aggregate->numa_distance) {
aggregate->numa_distance = physical_edge->numa_distance;
}
}
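// Writes the aggregated values back into the generic HAL edge. Only the
// capability bits this driver computes are replaced; all other capability
// flags on the edge are preserved.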
static void iree_hal_amdgpu_topology_edge_apply_aggregate(
const iree_hal_amdgpu_topology_edge_aggregate_t* aggregate,
iree_hal_topology_edge_t* edge) {
edge->lo = iree_hal_topology_edge_set_buffer_read_mode_noncoherent(
edge->lo, aggregate->noncoherent_read_mode);
edge->lo = iree_hal_topology_edge_set_buffer_write_mode_noncoherent(
edge->lo, aggregate->noncoherent_write_mode);
edge->lo = iree_hal_topology_edge_set_buffer_read_mode_coherent(
edge->lo, aggregate->coherent_read_mode);
edge->lo = iree_hal_topology_edge_set_buffer_write_mode_coherent(
edge->lo, aggregate->coherent_write_mode);
edge->lo =
iree_hal_topology_edge_set_link_class(edge->lo, aggregate->link_class);
edge->lo =
iree_hal_topology_edge_set_copy_cost(edge->lo, aggregate->copy_cost);
edge->lo = iree_hal_topology_edge_set_latency_class(edge->lo,
aggregate->latency_class);
edge->lo = iree_hal_topology_edge_set_numa_distance(edge->lo,
aggregate->numa_distance);
iree_hal_topology_capability_t capabilities =
iree_hal_topology_edge_capability_flags(edge->lo);
const iree_hal_topology_capability_t physical_capability_mask =
IREE_HAL_TOPOLOGY_CAPABILITY_P2P_COPY |
IREE_HAL_TOPOLOGY_CAPABILITY_PEER_COHERENT |
IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_DEVICE |
IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_SYSTEM;
capabilities &= ~physical_capability_mask;
capabilities |= aggregate->physical_capabilities & physical_capability_mask;
edge->lo =
iree_hal_topology_edge_set_capability_flags(edge->lo, capabilities);
}
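// Refines a generic topology edge between two AMDGPU logical devices using HSA
// link and memory pool queries, conservatively aggregating over all physical
// source/destination pairs.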
static iree_status_t iree_hal_amdgpu_logical_device_refine_topology_edge(
iree_hal_device_t* src_device, iree_hal_device_t* dst_device,
iree_hal_topology_edge_t* edge) {
iree_hal_amdgpu_logical_device_t* src_logical =
iree_hal_amdgpu_logical_device_cast(src_device);
iree_hal_amdgpu_logical_device_t* dst_logical =
iree_hal_amdgpu_logical_device_cast(dst_device);
const iree_hal_amdgpu_libhsa_t* libhsa = &src_logical->system->libhsa;
if (src_logical->physical_device_count == 0 ||
dst_logical->physical_device_count == 0) {
return iree_make_status(
IREE_STATUS_INTERNAL,
"cannot refine AMDGPU topology edge with an empty physical device set");
}
iree_hal_amdgpu_topology_edge_aggregate_t aggregate;
iree_hal_amdgpu_topology_edge_aggregate_initialize(*edge, &aggregate);
// A composite logical device has one generic HAL topology node but several
// physical HSA agents. The generic edge must be valid for any source/dest
// physical pair because the scheduler cannot encode a subset-specific edge.
for (iree_host_size_t source_index = 0;
source_index < src_logical->physical_device_count; ++source_index) {
const iree_hal_amdgpu_physical_device_t* source_physical_device =
src_logical->physical_devices[source_index];
for (iree_host_size_t destination_index = 0;
destination_index < dst_logical->physical_device_count;
++destination_index) {
const iree_hal_amdgpu_physical_device_t* destination_physical_device =
dst_logical->physical_devices[destination_index];
iree_hal_amdgpu_physical_topology_edge_t physical_edge;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_query_physical_topology_edge(
libhsa, source_physical_device, destination_physical_device,
&physical_edge));
iree_hal_amdgpu_topology_edge_aggregate_include(*edge, &physical_edge,
&aggregate);
}
}
iree_hal_amdgpu_topology_edge_apply_aggregate(&aggregate, edge);
return iree_ok_status();
}
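// Assigns (or, when |topology_info| is NULL, clears) frontier tracking for the
// logical device: allocates the per-queue epoch signal table and then wires
// each physical device to the shared frontier tracker.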
static iree_status_t iree_hal_amdgpu_logical_device_assign_topology_info(
iree_hal_device_t* base_device,
const iree_hal_device_topology_info_t* topology_info) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
if (!topology_info) {
iree_hal_amdgpu_logical_device_deassign_frontier(logical_device);
return iree_ok_status();
}
IREE_TRACE_ZONE_BEGIN(z0);
iree_hal_amdgpu_system_t* system = logical_device->system;
const uint8_t device_count = (uint8_t)system->topology.gpu_agent_count;
const uint8_t queue_stride = (uint8_t)system->topology.gpu_agent_queue_count;
const iree_host_size_t table_size =
iree_hal_amdgpu_epoch_signal_table_size(device_count, queue_stride);
iree_status_t status =
iree_allocator_malloc(logical_device->host_allocator, table_size,
(void**)&logical_device->host_queue_epoch_table);
if (iree_status_is_ok(status)) {
iree_hal_amdgpu_epoch_signal_table_initialize(
logical_device->host_queue_epoch_table,
iree_async_axis_session(topology_info->frontier.base_axis),
iree_async_axis_machine(topology_info->frontier.base_axis),
device_count, queue_stride);
}
for (iree_host_size_t device_ordinal = 0;
device_ordinal < logical_device->physical_device_count &&
iree_status_is_ok(status);
++device_ordinal) {
const iree_host_size_t host_ordinal =
system->topology.gpu_cpu_map[device_ordinal];
status = iree_hal_amdgpu_physical_device_assign_frontier(
base_device, system, logical_device->proactor,
topology_info->frontier.tracker, topology_info->frontier.base_axis,
logical_device->host_queue_epoch_table,
&system->host_memory_pools[host_ordinal],
logical_device->host_allocator,
logical_device->physical_devices[device_ordinal]);
}
if (iree_status_is_ok(status)) {
logical_device->topology_info = *topology_info;
logical_device->frontier_tracker = topology_info->frontier.tracker;
logical_device->axis = topology_info->frontier.base_axis;
iree_async_frontier_tracker_retain(logical_device->frontier_tracker);
} else {
iree_hal_amdgpu_logical_device_deassign_frontier(logical_device);
}
IREE_TRACE_ZONE_END(z0);
return status;
}
static iree_status_t iree_hal_amdgpu_logical_device_create_channel(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
iree_hal_channel_params_t params, iree_hal_channel_t** out_channel) {
return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
"AMDGPU collective channels not yet implemented");
}
static iree_status_t iree_hal_amdgpu_logical_device_create_command_buffer(
iree_hal_device_t* base_device, iree_hal_command_buffer_mode_t mode,
iree_hal_command_category_t command_categories,
iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
iree_hal_command_buffer_t** out_command_buffer) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_queue_affinity_t effective_queue_affinity = 0;
iree_host_size_t device_ordinal = 0;
IREE_RETURN_IF_ERROR(
iree_hal_amdgpu_logical_device_normalize_command_buffer_affinity(
logical_device, queue_affinity, &effective_queue_affinity,
&device_ordinal));
const iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[device_ordinal];
return iree_hal_amdgpu_aql_command_buffer_create(
iree_hal_device_allocator(base_device), mode, command_categories,
effective_queue_affinity, binding_capacity, device_ordinal,
physical_device->prepublished_kernarg_storage,
&logical_device->profile_metadata,
&logical_device->host_block_pools.command_buffer,
&logical_device->host_block_pools.small, logical_device->host_allocator,
out_command_buffer);
}
static iree_status_t iree_hal_amdgpu_logical_device_create_event(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
iree_hal_event_flags_t flags, iree_hal_event_t** out_event) {
return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
"AMDGPU events not yet implemented");
}
static iree_status_t iree_hal_amdgpu_logical_device_create_executable_cache(
iree_hal_device_t* base_device, iree_string_view_t identifier,
iree_hal_executable_cache_t** out_executable_cache) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
return iree_hal_amdgpu_executable_cache_create(
&logical_device->system->libhsa, &logical_device->system->topology,
&logical_device->profile_metadata, identifier,
iree_hal_device_host_allocator(base_device), out_executable_cache);
}
static iree_status_t iree_hal_amdgpu_logical_device_import_file(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
iree_hal_memory_access_t access, iree_io_file_handle_t* handle,
iree_hal_external_file_flags_t flags, iree_hal_file_t** out_file) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_queue_affinity_normalize(
logical_device->queue_affinity_mask, queue_affinity, &queue_affinity));
return iree_hal_file_from_handle(
iree_hal_device_allocator(base_device), queue_affinity, access, handle,
logical_device->proactor, iree_hal_device_host_allocator(base_device),
out_file);
}
static iree_status_t iree_hal_amdgpu_logical_device_create_semaphore(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
uint64_t initial_value, iree_hal_semaphore_flags_t flags,
iree_hal_semaphore_t** out_semaphore) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
return iree_hal_amdgpu_semaphore_create(
logical_device, logical_device->proactor, queue_affinity, initial_value,
flags, logical_device->host_allocator, out_semaphore);
}
static iree_hal_semaphore_compatibility_t
iree_hal_amdgpu_logical_device_query_semaphore_compatibility(
iree_hal_device_t* base_device, iree_hal_semaphore_t* semaphore) {
if (iree_hal_amdgpu_semaphore_isa(semaphore)) {
return IREE_HAL_SEMAPHORE_COMPATIBILITY_ALL;
}
return IREE_HAL_SEMAPHORE_COMPATIBILITY_HOST_ONLY;
}
static iree_status_t iree_hal_amdgpu_logical_device_query_queue_pool_backend(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
iree_hal_queue_pool_backend_t* out_backend) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_physical_device_t* physical_device = NULL;
IREE_RETURN_IF_ERROR(
iree_hal_amdgpu_logical_device_select_queue_pool_physical_device(
logical_device, queue_affinity, &physical_device));
out_backend->slab_provider = physical_device->default_slab_provider;
out_backend->notification = physical_device->default_pool_notification;
out_backend->epoch_query = (iree_hal_pool_epoch_query_t){
.fn = iree_hal_amdgpu_logical_device_query_pool_epoch,
.user_data = logical_device,
};
return iree_ok_status();
}
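// Queue operations below resolve |queue_affinity| to a single host queue via
// iree_hal_amdgpu_logical_device_select_host_queue and forward to its vtable.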
static iree_status_t iree_hal_amdgpu_logical_device_queue_alloca(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_pool_t* pool, iree_hal_buffer_params_t params,
iree_device_size_t allocation_size, iree_hal_alloca_flags_t flags,
iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->alloca(queue, wait_semaphore_list,
signal_semaphore_list, pool, params,
allocation_size, flags, out_buffer);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_dealloca(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_buffer_t* buffer, iree_hal_dealloca_flags_t flags) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->dealloca(queue, wait_semaphore_list,
signal_semaphore_list, buffer, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_fill(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
iree_device_size_t length, const void* pattern,
iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
  // Match the HAL contract documented on iree_hal_command_buffer_fill_buffer
  // (1/2/4-byte patterns only) so queue_fill and command_buffer_fill accept
  // the same inputs across all backends. The device kernel itself supports an
  // 8-byte pattern path via iree_hal_amdgpu_device_buffer_fill_x8, but we
  // deliberately do not expose it here: code issuing 8-byte fills would only
  // be portable to the amdgpu backend.
if (IREE_UNLIKELY(pattern_length != 1 && pattern_length != 2 &&
pattern_length != 4)) {
return iree_make_status(
IREE_STATUS_INVALID_ARGUMENT,
"fill patterns must be 1, 2, or 4 bytes (got %" PRIhsz ")",
pattern_length);
}
if (IREE_UNLIKELY(!pattern)) {
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"fill pattern pointer is required");
}
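  // Widen the 1/2/4-byte pattern into a fixed 64-bit value: the queue vtable
  // takes (pattern_bits, pattern_length) rather than a host pointer, avoiding
  // any pattern lifetime requirements on the caller.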
uint64_t pattern_bits = 0;
memcpy(&pattern_bits, pattern, pattern_length);
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->fill(queue, wait_semaphore_list, signal_semaphore_list,
target_buffer, target_offset, length, pattern_bits,
pattern_length, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_update(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
const void* source_buffer, iree_host_size_t source_offset,
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
iree_device_size_t length, iree_hal_update_flags_t flags) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->update(
queue, wait_semaphore_list, signal_semaphore_list, source_buffer,
source_offset, target_buffer, target_offset, length, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_copy(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
iree_device_size_t length, iree_hal_copy_flags_t flags) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->copy(queue, wait_semaphore_list, signal_semaphore_list,
source_buffer, source_offset, target_buffer,
target_offset, length, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_read(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_file_t* source_file, uint64_t source_offset,
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
iree_device_size_t length, iree_hal_read_flags_t flags) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->read(queue, wait_semaphore_list, signal_semaphore_list,
source_file, source_offset, target_buffer,
target_offset, length, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_write(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
iree_hal_file_t* target_file, uint64_t target_offset,
iree_device_size_t length, iree_hal_write_flags_t flags) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->write(queue, wait_semaphore_list, signal_semaphore_list,
source_buffer, source_offset, target_file,
target_offset, length, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_host_call(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_host_call_t call, const uint64_t args[4],
iree_hal_host_call_flags_t flags) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->host_call(queue, wait_semaphore_list,
signal_semaphore_list, call, args, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_dispatch(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_executable_t* executable,
iree_hal_executable_export_ordinal_t export_ordinal,
const iree_hal_dispatch_config_t config, iree_const_byte_span_t constants,
const iree_hal_buffer_ref_list_t bindings,
iree_hal_dispatch_flags_t flags) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->dispatch(
queue, wait_semaphore_list, signal_semaphore_list, executable,
export_ordinal, config, constants, bindings, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_execute(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_command_buffer_t* command_buffer,
iree_hal_buffer_binding_table_t binding_table,
iree_hal_execute_flags_t flags) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->execute(queue, wait_semaphore_list,
signal_semaphore_list, command_buffer,
binding_table, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_flush(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_queue_affinity_normalize(
logical_device->queue_affinity_mask, queue_affinity, &queue_affinity));
IREE_HAL_FOR_QUEUE_AFFINITY(queue_affinity) {
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_queue_from_ordinal(
logical_device, queue_ordinal, &queue));
IREE_RETURN_IF_ERROR(queue->vtable->flush(queue));
}
return iree_ok_status();
}
static iree_status_t
iree_hal_amdgpu_logical_device_verify_queue_device_profiling_supported(
iree_hal_amdgpu_logical_device_t* logical_device) {
for (iree_host_size_t i = 0; i < logical_device->physical_device_count; ++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
if (iree_hal_amdgpu_vendor_packet_capabilities_support_timestamp_range(
physical_device->vendor_packet_capabilities)) {
continue;
}
return iree_make_status(
IREE_STATUS_FAILED_PRECONDITION,
"AMDGPU queue operation profiling requires PM4 timestamp range "
"support on physical device %" PRIhsz,
physical_device->device_ordinal);
}
return iree_ok_status();
}
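// Begins a profiling session: resolves and clones the options, allocates
// counter/trace/device-metrics sessions, opens the sink session, and enables
// HSA/counter/trace profiling. On any failure everything already enabled is
// torn down in reverse order.
//
// Illustrative usage through the generic HAL API (not part of this file):
//   IREE_RETURN_IF_ERROR(iree_hal_device_profiling_begin(device, &options));
//   // ... submit and wait on the workloads to capture ...
//   IREE_RETURN_IF_ERROR(iree_hal_device_profiling_end(device));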
static iree_status_t iree_hal_amdgpu_logical_device_profiling_begin(
iree_hal_device_t* base_device,
const iree_hal_device_profiling_options_t* options) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_device_profiling_options_t resolved_options =
iree_hal_amdgpu_logical_device_resolve_profiling_options(options);
if (iree_hal_device_profiling_options_requests_data(
&resolved_options,
IREE_HAL_DEVICE_PROFILING_DATA_HOST_EXECUTION_EVENTS)) {
return iree_make_status(
IREE_STATUS_UNIMPLEMENTED,
"AMDGPU profiling does not produce host execution events");
}
if (resolved_options.data_families == IREE_HAL_DEVICE_PROFILING_DATA_NONE) {
return iree_ok_status();
}
if (!logical_device->frontier_tracker) {
return iree_make_status(
IREE_STATUS_FAILED_PRECONDITION,
"AMDGPU profiling requires an assigned device topology");
}
if (logical_device->profiling.options.data_families !=
IREE_HAL_DEVICE_PROFILING_DATA_NONE) {
return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
"cannot nest AMDGPU profile captures");
}
if (iree_hal_device_profiling_options_requests_data(
&resolved_options,
IREE_HAL_DEVICE_PROFILING_DATA_DEVICE_QUEUE_EVENTS)) {
IREE_RETURN_IF_ERROR(
iree_hal_amdgpu_logical_device_verify_queue_device_profiling_supported(
logical_device));
}
bool sink_session_begun = false;
bool hsa_profiling_enabled = false;
bool counter_profiling_enabled = false;
bool trace_profiling_enabled = false;
iree_hal_device_profiling_options_t session_options = {0};
iree_hal_device_profiling_options_storage_t* options_storage = NULL;
iree_hal_amdgpu_profile_counter_session_t* counter_session = NULL;
iree_hal_amdgpu_profile_trace_session_t* trace_session = NULL;
iree_hal_amdgpu_profile_device_metrics_session_t* device_metrics_session =
NULL;
iree_status_t status = iree_hal_device_profiling_options_clone(
&resolved_options, logical_device->host_allocator, &session_options,
&options_storage);
iree_hal_profile_sink_t* sink = session_options.sink;
uint64_t session_id = 0;
iree_hal_profile_chunk_metadata_t metadata = {0};
if (iree_status_is_ok(status)) {
session_id = logical_device->next_profile_session_id++;
metadata = iree_hal_amdgpu_logical_device_profile_session_metadata(
logical_device, session_id);
logical_device->profiling.next_clock_correlation_sample_id = 1;
memset(&logical_device->profiling.metadata_cursor, 0,
sizeof(logical_device->profiling.metadata_cursor));
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_profile_counter_session_allocate(
logical_device, &session_options, logical_device->host_allocator,
&counter_session);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_profile_trace_session_allocate(
logical_device, &session_options, logical_device->host_allocator,
&trace_session);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_profile_device_metrics_session_allocate(
logical_device, &session_options, logical_device->host_allocator,
&device_metrics_session);
}
if (iree_status_is_ok(status)) {
status = iree_hal_profile_sink_begin_session(sink, &metadata);
sink_session_begun = iree_status_is_ok(status);
}
if (iree_status_is_ok(status) &&
iree_hal_device_profiling_options_requests_data(
&session_options, IREE_HAL_DEVICE_PROFILING_DATA_QUEUE_EVENTS)) {
status = iree_hal_amdgpu_profile_event_streams_ensure_queue_storage(
&logical_device->profiling.event_streams,
IREE_HAL_AMDGPU_LOGICAL_DEVICE_PROFILE_QUEUE_EVENT_CAPACITY,
logical_device->host_allocator);
if (iree_status_is_ok(status)) {
iree_hal_amdgpu_profile_event_streams_clear_queue(
&logical_device->profiling.event_streams);
}
}
if (iree_status_is_ok(status) &&
iree_hal_device_profiling_options_requests_data(
&session_options, IREE_HAL_DEVICE_PROFILING_DATA_MEMORY_EVENTS)) {
status = iree_hal_amdgpu_profile_event_streams_ensure_memory_storage(
&logical_device->profiling.event_streams,
IREE_HAL_AMDGPU_LOGICAL_DEVICE_PROFILE_MEMORY_EVENT_CAPACITY,
logical_device->host_allocator);
if (iree_status_is_ok(status)) {
iree_hal_amdgpu_profile_event_streams_clear_memory(
&logical_device->profiling.event_streams);
}
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_logical_device_write_profile_metadata(
logical_device, sink, session_id, session_options.data_families);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_profile_counter_session_write_metadata(
counter_session, sink, session_id, logical_device->identifier);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_profile_device_metrics_session_write_metadata(
device_metrics_session, sink, session_id, logical_device->identifier);
}
if (iree_status_is_ok(status) &&
iree_hal_amdgpu_logical_device_profiling_needs_hsa_timestamps(
session_options.data_families)) {
status = iree_hal_amdgpu_logical_device_set_hsa_profiling_enabled(
logical_device, true);
hsa_profiling_enabled = iree_status_is_ok(status);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_logical_device_set_counter_profiling_enabled(
logical_device, counter_session, true);
counter_profiling_enabled = iree_status_is_ok(status);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_logical_device_set_trace_profiling_enabled(
logical_device, trace_session, true);
trace_profiling_enabled = iree_status_is_ok(status);
}
if (iree_status_is_ok(status)) {
logical_device->profiling.options = session_options;
logical_device->profiling.options_storage = options_storage;
logical_device->profiling.session_id = session_id;
logical_device->profiling.counter_session = counter_session;
logical_device->profiling.trace_session = trace_session;
logical_device->profiling.device_metrics_session = device_metrics_session;
iree_hal_amdgpu_logical_device_set_queue_profiling_enabled(
logical_device,
iree_hal_amdgpu_logical_device_queue_profile_flags(&session_options));
} else {
if (trace_profiling_enabled) {
status = iree_status_join(
status, iree_hal_amdgpu_logical_device_set_trace_profiling_enabled(
logical_device, trace_session, false));
}
if (counter_profiling_enabled) {
status = iree_status_join(
status, iree_hal_amdgpu_logical_device_set_counter_profiling_enabled(
logical_device, counter_session, false));
}
if (hsa_profiling_enabled) {
status = iree_status_join(
status, iree_hal_amdgpu_logical_device_set_hsa_profiling_enabled(
logical_device, false));
}
if (sink_session_begun) {
status = iree_status_join(
status, iree_hal_profile_sink_end_session(sink, &metadata,
iree_status_code(status)));
}
logical_device->profiling.next_clock_correlation_sample_id = 0;
memset(&logical_device->profiling.metadata_cursor, 0,
sizeof(logical_device->profiling.metadata_cursor));
iree_hal_device_profiling_options_storage_free(
options_storage, logical_device->host_allocator);
iree_hal_amdgpu_profile_counter_session_free(counter_session);
iree_hal_amdgpu_profile_trace_session_free(trace_session);
iree_hal_amdgpu_profile_device_metrics_session_free(device_metrics_session);
}
return status;
}
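// Flushes buffered profile data mid-session without ending the session.
// A no-op when no session is active.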
static iree_status_t iree_hal_amdgpu_logical_device_profiling_flush(
iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
const iree_hal_device_profiling_options_t* options =
&logical_device->profiling.options;
if (options->data_families == IREE_HAL_DEVICE_PROFILING_DATA_NONE) {
return iree_ok_status();
}
iree_hal_profile_sink_t* sink = options->sink;
const bool emit_executable_artifacts =
iree_hal_amdgpu_logical_device_profile_needs_executable_artifacts(
options->data_families);
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_profile_metadata_write(
&logical_device->profile_metadata, sink,
logical_device->profiling.session_id, logical_device->identifier,
emit_executable_artifacts, &logical_device->profiling.metadata_cursor));
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_write_profile_events(
logical_device, sink, logical_device->profiling.session_id));
IREE_RETURN_IF_ERROR(
iree_hal_amdgpu_logical_device_write_profile_clock_correlations(
logical_device, sink, logical_device->profiling.session_id));
return iree_hal_amdgpu_profile_device_metrics_session_sample_and_write(
logical_device->profiling.device_metrics_session, sink,
logical_device->profiling.session_id, logical_device->identifier);
}
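// Ends the active profiling session: writes any remaining metadata, events,
// clock correlations, and device metrics, then disables trace/counter/HSA
// profiling in reverse enable order and closes the sink session even if
// earlier writes failed.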
static iree_status_t iree_hal_amdgpu_logical_device_profiling_end(
iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_status_t status = iree_ok_status();
const iree_hal_device_profiling_data_families_t data_families =
logical_device->profiling.options.data_families;
if (data_families == IREE_HAL_DEVICE_PROFILING_DATA_NONE) {
return iree_ok_status();
}
iree_hal_profile_sink_t* sink = logical_device->profiling.options.sink;
iree_hal_amdgpu_profile_counter_session_t* counter_session =
logical_device->profiling.counter_session;
iree_hal_amdgpu_profile_trace_session_t* trace_session =
logical_device->profiling.trace_session;
iree_hal_amdgpu_profile_device_metrics_session_t* device_metrics_session =
logical_device->profiling.device_metrics_session;
const uint64_t session_id = logical_device->profiling.session_id;
iree_hal_profile_chunk_metadata_t metadata =
iree_hal_amdgpu_logical_device_profile_session_metadata(logical_device,
session_id);
const bool emit_executable_artifacts =
iree_hal_amdgpu_logical_device_profile_needs_executable_artifacts(
data_families);
status = iree_hal_amdgpu_profile_metadata_write(
&logical_device->profile_metadata, sink, session_id,
logical_device->identifier, emit_executable_artifacts,
&logical_device->profiling.metadata_cursor);
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_logical_device_write_profile_events(
logical_device, sink, session_id);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_logical_device_write_profile_clock_correlations(
logical_device, sink, session_id);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_profile_device_metrics_session_sample_and_write(
device_metrics_session, sink, session_id, logical_device->identifier);
}
status = iree_status_join(
status, iree_hal_amdgpu_logical_device_set_trace_profiling_enabled(
logical_device, trace_session, false));
status = iree_status_join(
status, iree_hal_amdgpu_logical_device_set_counter_profiling_enabled(
logical_device, counter_session, false));
if (iree_hal_amdgpu_logical_device_profiling_needs_hsa_timestamps(
data_families)) {
status = iree_status_join(
status, iree_hal_amdgpu_logical_device_set_hsa_profiling_enabled(
logical_device, false));
}
status =
iree_status_join(status, iree_hal_profile_sink_end_session(
sink, &metadata, iree_status_code(status)));
iree_hal_amdgpu_logical_device_reset_profile_options(logical_device);
logical_device->profiling.session_id = 0;
logical_device->profiling.next_clock_correlation_sample_id = 0;
memset(&logical_device->profiling.metadata_cursor, 0,
sizeof(logical_device->profiling.metadata_cursor));
logical_device->profiling.counter_session = NULL;
logical_device->profiling.trace_session = NULL;
logical_device->profiling.device_metrics_session = NULL;
iree_hal_amdgpu_logical_device_set_queue_profiling_enabled(
logical_device, IREE_HAL_AMDGPU_HOST_QUEUE_PROFILE_FLAG_NONE);
iree_hal_amdgpu_profile_counter_session_free(counter_session);
iree_hal_amdgpu_profile_trace_session_free(trace_session);
iree_hal_amdgpu_profile_device_metrics_session_free(device_metrics_session);
return status;
}
static iree_status_t iree_hal_amdgpu_logical_device_external_capture_begin(
iree_hal_device_t* base_device,
const iree_hal_device_external_capture_options_t* options) {
(void)base_device;
(void)options;
return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
"AMDGPU external capture not implemented");
}
static iree_status_t iree_hal_amdgpu_logical_device_external_capture_end(
iree_hal_device_t* base_device) {
(void)base_device;
return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
"AMDGPU external capture not implemented");
}
static const iree_hal_device_vtable_t iree_hal_amdgpu_logical_device_vtable = {
.destroy = iree_hal_amdgpu_logical_device_destroy,
.id = iree_hal_amdgpu_logical_device_id,
.host_allocator = iree_hal_amdgpu_logical_device_host_allocator,
.device_allocator = iree_hal_amdgpu_logical_device_allocator,
.replace_device_allocator = iree_hal_amdgpu_replace_device_allocator,
.replace_channel_provider = iree_hal_amdgpu_replace_channel_provider,
.trim = iree_hal_amdgpu_logical_device_trim,
.query_i64 = iree_hal_amdgpu_logical_device_query_i64,
.query_capabilities = iree_hal_amdgpu_logical_device_query_capabilities,
.topology_info = iree_hal_amdgpu_logical_device_topology_info,
.refine_topology_edge = iree_hal_amdgpu_logical_device_refine_topology_edge,
.assign_topology_info = iree_hal_amdgpu_logical_device_assign_topology_info,
.create_channel = iree_hal_amdgpu_logical_device_create_channel,
.create_command_buffer =
iree_hal_amdgpu_logical_device_create_command_buffer,
.create_event = iree_hal_amdgpu_logical_device_create_event,
.create_executable_cache =
iree_hal_amdgpu_logical_device_create_executable_cache,
.import_file = iree_hal_amdgpu_logical_device_import_file,
.create_semaphore = iree_hal_amdgpu_logical_device_create_semaphore,
.query_semaphore_compatibility =
iree_hal_amdgpu_logical_device_query_semaphore_compatibility,
.query_queue_pool_backend =
iree_hal_amdgpu_logical_device_query_queue_pool_backend,
.queue_alloca = iree_hal_amdgpu_logical_device_queue_alloca,
.queue_dealloca = iree_hal_amdgpu_logical_device_queue_dealloca,
.queue_fill = iree_hal_amdgpu_logical_device_queue_fill,
.queue_update = iree_hal_amdgpu_logical_device_queue_update,
.queue_copy = iree_hal_amdgpu_logical_device_queue_copy,
.queue_read = iree_hal_amdgpu_logical_device_queue_read,
.queue_write = iree_hal_amdgpu_logical_device_queue_write,
.queue_host_call = iree_hal_amdgpu_logical_device_queue_host_call,
.queue_dispatch = iree_hal_amdgpu_logical_device_queue_dispatch,
.queue_execute = iree_hal_amdgpu_logical_device_queue_execute,
.queue_flush = iree_hal_amdgpu_logical_device_queue_flush,
.profiling_begin = iree_hal_amdgpu_logical_device_profiling_begin,
.profiling_flush = iree_hal_amdgpu_logical_device_profiling_flush,
.profiling_end = iree_hal_amdgpu_logical_device_profiling_end,
.external_capture_begin =
iree_hal_amdgpu_logical_device_external_capture_begin,
.external_capture_end = iree_hal_amdgpu_logical_device_external_capture_end,
};