// blob: 867f799a9ca019028e7114a6e70f2a9e9ee54f34 [file]
// Copyright 2025 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#ifndef IREE_HAL_DRIVERS_AMDGPU_PHYSICAL_DEVICE_H_
#define IREE_HAL_DRIVERS_AMDGPU_PHYSICAL_DEVICE_H_
#include "iree/base/api.h"
#include "iree/base/internal/arena.h"
#include "iree/hal/drivers/amdgpu/buffer.h"
#include "iree/hal/drivers/amdgpu/host_queue.h"
#include "iree/hal/drivers/amdgpu/host_queue_staging.h"
#include "iree/hal/drivers/amdgpu/physical_device_capabilities.h"
#include "iree/hal/drivers/amdgpu/system.h"
#include "iree/hal/drivers/amdgpu/transient_buffer.h"
#include "iree/hal/drivers/amdgpu/util/block_pool.h"
#include "iree/hal/drivers/amdgpu/util/libhsa.h"
#include "iree/hal/drivers/amdgpu/util/signal_pool.h"
#include "iree/hal/drivers/amdgpu/util/target_id.h"
#include "iree/hal/memory/slab_provider.h"
#include "iree/hal/memory/tlsf_pool.h"
#include "iree/hal/pool.h"
#include "iree/hal/pool_set.h"
// Forward declaration of the host memory pools type used by the structures and
// functions declared in this header.
// NOTE(review): iree_hal_amdgpu_physical_device_t stores this type by value
// (|host_memory_pools|), which requires the complete definition to be visible
// through one of the includes above — confirm which header provides it.
typedef struct iree_hal_amdgpu_host_memory_pools_t
iree_hal_amdgpu_host_memory_pools_t;
//===----------------------------------------------------------------------===//
// iree_hal_amdgpu_physical_device_options_t
//===----------------------------------------------------------------------===//
// Power-of-two size for the per-device small block pool in bytes.
// Used for command buffer headers and other small data structures.
// Defaults to 32 KiB.
#define IREE_HAL_AMDGPU_PHYSICAL_DEVICE_SMALL_DEVICE_BLOCK_SIZE_DEFAULT \
(32 * 1024)
// Minimum number of small blocks per device allocation.
// Reduces allocation overhead at the cost of under-utilizing memory.
// With the default block size each allocation holds at least
// 128 * 32 KiB = 4 MiB of blocks.
#define IREE_HAL_AMDGPU_PHYSICAL_DEVICE_SMALL_DEVICE_BLOCKS_PER_ALLOCATION_DEFAULT \
(128)
// Initial capacity in blocks of the per-device small block pool. Block pools
// will grow as needed but accounting is cleaner if we pre-initialize them to a
// (hopefully) sufficient size.
// Defaults to the blocks-per-allocation value, i.e. one allocation's worth of
// blocks.
#define IREE_HAL_AMDGPU_PHYSICAL_DEVICE_SMALL_DEVICE_BLOCK_INITIAL_CAPACITY_DEFAULT \
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_SMALL_DEVICE_BLOCKS_PER_ALLOCATION_DEFAULT
// Power-of-two size for the per-device large block pool in bytes.
// Used for command buffer commands and data. Must be large enough to fit inline
// command buffer uploads.
// Defaults to 256 KiB.
#define IREE_HAL_AMDGPU_PHYSICAL_DEVICE_LARGE_DEVICE_BLOCK_SIZE_DEFAULT \
(256 * 1024)
// Minimum number of large blocks per device allocation.
// Reduces allocation overhead at the cost of under-utilizing memory.
// With the default block size each allocation holds at least
// 16 * 256 KiB = 4 MiB of blocks.
#define IREE_HAL_AMDGPU_PHYSICAL_DEVICE_LARGE_DEVICE_BLOCKS_PER_ALLOCATION_DEFAULT \
(16)
// Initial capacity in blocks of the per-device large block pool. Block pools
// will grow as needed but accounting is cleaner if we pre-initialize them to a
// (hopefully) sufficient size.
// Defaults to the blocks-per-allocation value, i.e. one allocation's worth of
// blocks.
#define IREE_HAL_AMDGPU_PHYSICAL_DEVICE_LARGE_DEVICE_BLOCK_INITIAL_CAPACITY_DEFAULT \
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_LARGE_DEVICE_BLOCKS_PER_ALLOCATION_DEFAULT
// Power-of-two size for the per-device host block pool in bytes.
// Since primarily used for transient submission-specific allocations it need
// not be large.
// Defaults to 8 KiB.
#define IREE_HAL_AMDGPU_PHYSICAL_DEVICE_HOST_BLOCK_SIZE_DEFAULT (8 * 1024)
// Logical byte length for the default per-device queue-allocation pool.
// Defaults to 64 MiB.
#define IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_POOL_RANGE_LENGTH_DEFAULT \
(64 * 1024 * 1024)
// Logical byte length for host-visible default queue-allocation pool slabs.
// Defaults to 64 KiB.
#define IREE_HAL_AMDGPU_PHYSICAL_DEVICE_HOST_POOL_RANGE_LENGTH_DEFAULT \
(64 * 1024)
// Minimum byte alignment for default-pool suballocations.
// Defaults to 256 bytes.
#define IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_POOL_ALIGNMENT_DEFAULT 256
// Maximum death-frontier entries stored per free default-pool block.
// Aliases the TLSF pool default, which is defined outside this header.
#define IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_POOL_FRONTIER_CAPACITY_DEFAULT \
IREE_HAL_MEMORY_TLSF_DEFAULT_FRONTIER_CAPACITY
// Default total number of HAL queues on the physical device.
// Aliases IREE_HAL_AMDGPU_DEFAULT_GPU_AGENT_QUEUE_COUNT, which is defined
// outside this header.
#define IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_QUEUE_COUNT \
IREE_HAL_AMDGPU_DEFAULT_GPU_AGENT_QUEUE_COUNT
// Default per-queue hardware AQL ring capacity in packets.
#define IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_HOST_QUEUE_AQL_CAPACITY \
IREE_HAL_AMDGPU_DEFAULT_EXECUTION_QUEUE_CAPACITY
// Default per-queue completion/reclaim ring capacity in epochs and hot entries.
#define IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_HOST_QUEUE_NOTIFICATION_CAPACITY \
IREE_HAL_AMDGPU_DEFAULT_NOTIFICATION_CAPACITY
// Default per-queue kernarg ring capacity in 64-byte blocks.
// Computed by dividing the default kernarg ringbuffer capacity (presumably
// expressed in bytes — confirm at its definition) by the size of one kernarg
// block and casting the quotient to uint32_t.
#define IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_HOST_QUEUE_KERNARG_CAPACITY \
((uint32_t)(IREE_HAL_AMDGPU_DEFAULT_KERNARG_RINGBUFFER_CAPACITY / \
sizeof(iree_hal_amdgpu_kernarg_block_t)))
// Default per-queue upload ring capacity; the value is zero.
// NOTE(review): presumably the default for
// iree_hal_amdgpu_physical_device_options_t::host_queue_upload_capacity, where
// zero is documented as disabling the optional upload ring — confirm in the
// options initializer.
#define IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_HOST_QUEUE_UPLOAD_CAPACITY 0
// Options controlling how a physical device is initialized.
//
// Populate with iree_hal_amdgpu_physical_device_options_initialize and check
// with iree_hal_amdgpu_physical_device_options_verify: both
// iree_hal_amdgpu_physical_device_calculate_size and
// iree_hal_amdgpu_physical_device_initialize require verified options.
typedef struct iree_hal_amdgpu_physical_device_options_t {
// Options for each device block pool.
// Used for both coarse-grained and fine-grained memory types.
struct {
// Small device block pool.
// Used for command buffer headers and other small data structures.
iree_hal_amdgpu_block_pool_options_t small;
// Large device block pool.
// Used for command buffer commands and data. Must be large enough to fit
// inline command buffer uploads.
iree_hal_amdgpu_block_pool_options_t large;
} device_block_pools;
// Size of the per-device small host block pool.
// This is primarily used for per-submission resource sets and other transient
// bookkeeping that should never be _too_ large or live _too_ long.
// NOTE(review): IREE_HAL_AMDGPU_PHYSICAL_DEVICE_HOST_BLOCK_SIZE_DEFAULT
// suggests this is the byte size of each block rather than of the whole pool
// — confirm against the implementation.
iree_host_size_t host_block_pool_size;
// Initial block count preallocated for the host block pool.
iree_host_size_t host_block_pool_initial_capacity;
// Number of host queues created for this physical device.
iree_host_size_t host_queue_count;
// Per-host-queue HSA AQL ring capacity in packets.
uint32_t host_queue_aql_capacity;
// Per-host-queue completion/reclaim ring capacity.
uint32_t host_queue_notification_capacity;
// Per-host-queue kernarg ring capacity in 64-byte blocks.
uint32_t host_queue_kernarg_capacity;
// Per-host-queue device-visible control upload ring capacity in bytes. Zero
// disables the optional upload ring.
uint32_t host_queue_upload_capacity;
// Default queue-allocation pool policy.
struct {
// Logical byte length of the default TLSF pool range.
iree_device_size_t range_length;
// Minimum byte alignment for every default-pool reservation.
iree_device_size_t alignment;
// Maximum death-frontier entry count stored per free TLSF block.
// Stored as uint8_t, so values above 255 cannot be represented.
uint8_t frontier_capacity;
} default_pool;
// Fixed-size queue_read/queue_write staging policy.
iree_hal_amdgpu_staging_pool_options_t file_staging;
// Forces cross-queue wait barriers to use software deferral instead of the
// optimal device-side strategy for the GPU ISA.
// Single-bit flag: nonzero forces deferral.
uint32_t force_wait_barrier_defer : 1;
} iree_hal_amdgpu_physical_device_options_t;
// Initializes |out_options| to its default values.
//
// NOTE(review): the defaults are presumably the
// IREE_HAL_AMDGPU_PHYSICAL_DEVICE_*_DEFAULT / *_DEFAULT_* constants declared in
// this header — confirm in the implementation. Callers may override individual
// fields afterwards and should then run
// iree_hal_amdgpu_physical_device_options_verify.
void iree_hal_amdgpu_physical_device_options_initialize(
iree_hal_amdgpu_physical_device_options_t* out_options);
// Verifies device options to ensure they meet the agent requirements.
//
// |options| is read-only. |libhsa| is the HSA API table; |cpu_agent| and
// |gpu_agent| are the HSA agents the options are checked against.
// Returns a status describing the outcome; which specific constraints are
// checked is defined by the implementation, not this header.
// A successful verification is a stated precondition of
// iree_hal_amdgpu_physical_device_calculate_size and
// iree_hal_amdgpu_physical_device_initialize.
iree_status_t iree_hal_amdgpu_physical_device_options_verify(
const iree_hal_amdgpu_physical_device_options_t* options,
const iree_hal_amdgpu_libhsa_t* libhsa, hsa_agent_t cpu_agent,
hsa_agent_t gpu_agent);
//===----------------------------------------------------------------------===//
// iree_hal_amdgpu_physical_device_t
//===----------------------------------------------------------------------===//
// A physical device representing an HSA GPU agent.
// May contain one or more HAL queues that map to HSA queues on the agent.
//
// The struct ends in a flexible array member (|host_queues|), so sizeof alone
// does not cover the queue storage: backing memory must be at least
// iree_hal_amdgpu_physical_device_calculate_size bytes.
typedef struct iree_hal_amdgpu_physical_device_t {
// GPU agent.
hsa_agent_t device_agent;
// Ordinal of the GPU agent within the topology.
iree_host_size_t device_ordinal;
// HSA driver identifier used when querying per-device clock counters.
uint32_t driver_uid;
// PCI domain from HSA_AMD_AGENT_INFO_DOMAIN.
uint32_t pci_domain;
// PCI bus decoded from HSA_AMD_AGENT_INFO_BDFID.
uint32_t pci_bus;
// PCI device decoded from HSA_AMD_AGENT_INFO_BDFID.
uint32_t pci_device;
// PCI function decoded from HSA_AMD_AGENT_INFO_BDFID.
uint32_t pci_function;
// True when the PCI identity fields contain HSA-provided values.
// Check this before relying on |pci_domain|, |pci_bus|, |pci_device|, or
// |pci_function|.
uint32_t has_pci_identity : 1;
// HSA ISA identity selected for this GPU agent.
struct {
// Storage backing |target_id.processor|.
char target_id_processor[64];
// Parsed target identity, including XNACK/SRAMECC support and mode.
iree_hal_amdgpu_target_id_t target_id;
} isa;
// Stable physical device UUID bytes reported by HSA when available.
uint8_t physical_device_uuid[16];
// True when |physical_device_uuid| contains a stable HSA device identifier.
uint32_t has_physical_device_uuid : 1;
// NUMA node of the CPU agent nearest to |device_agent|.
uint32_t host_numa_node;
// Host memory pools for the CPU agent nearest to |device_agent|.
// Stored by value rather than by pointer.
iree_hal_amdgpu_host_memory_pools_t host_memory_pools;
// Cold memory-system facts used to derive conservative topology flags.
iree_hal_amdgpu_memory_system_capabilities_t memory_system;
// CPU-visible coarse-grained device-memory capability for this GPU.
iree_hal_amdgpu_cpu_visible_device_coarse_memory_t
cpu_visible_device_coarse_memory;
// Prepublished command-buffer kernarg storage capability for this GPU.
iree_hal_amdgpu_aql_prepublished_kernarg_storage_t
prepublished_kernarg_storage;
// Fine-grained block pools for device memory blocks of various sizes.
iree_hal_amdgpu_block_pools_t fine_block_pools;
// Fine-grained block pool-based allocators for small transient allocations.
iree_hal_amdgpu_block_allocators_t fine_block_allocators;
// Coarse-grained block pools for device memory blocks of various sizes.
iree_hal_amdgpu_block_pools_t coarse_block_pools;
// Coarse-grained block pool-based allocators for small transient allocations.
iree_hal_amdgpu_block_allocators_t coarse_block_allocators;
// Host-side small allocation block pool.
// Shared amongst all queues in the physical device. We don't share with other
// devices as they may be attached to different NUMA nodes. Though still
// possible for queue entries to be allocated on one node and freed on another
// the common case will be that the blocks are touched by the same device.
iree_arena_block_pool_t fine_host_block_pool;
// Per-device pool of user-visible queue_alloca transient buffer wrappers.
iree_hal_amdgpu_transient_buffer_pool_t transient_buffer_pool;
// Per-device pool of materialized slab-backed HAL buffer view wrappers.
iree_hal_amdgpu_buffer_pool_t materialized_buffer_pool;
// Pool of HSA signals for host-waited semaphores and proactor integration.
iree_hal_amdgpu_host_signal_pool_t host_signal_pool;
// Default queue-allocation pool notification for this physical device.
iree_async_notification_t* default_pool_notification;
// Slab provider backing default and caller-created pools for this domain.
iree_hal_slab_provider_t* default_slab_provider;
// Host-local slab provider for mappable queue allocation transients.
iree_hal_slab_provider_t* default_host_slab_provider;
// TLSF options derived from device options and HSA memory-pool properties.
iree_hal_tlsf_pool_options_t default_pool_options;
// Routes default queue allocations to the best compatible memory pool.
iree_hal_pool_set_t default_pool_set;
// Frontier-aware suballocating pool used up to the TLSF slab length.
iree_hal_pool_t* default_pool;
// Direct per-allocation pool used for requests larger than one TLSF slab.
iree_hal_pool_t* default_oversized_pool;
// Frontier-aware suballocating pool for host-visible queue allocations.
iree_hal_pool_t* default_host_pool;
// Direct host-visible pool used for requests larger than one host TLSF slab.
iree_hal_pool_t* default_host_oversized_pool;
// Fixed-size staging pool for non-mappable queue_read/queue_write transfers.
iree_hal_amdgpu_staging_pool_t file_staging_pool;
// Builtin kernel table for this GPU agent.
iree_hal_amdgpu_device_kernels_t device_kernels;
// Host/device-neutral transfer context that points into |device_kernels|.
// Because it refers to a sibling field, the struct is presumably not safe to
// relocate with a plain memcpy once initialized — confirm.
iree_hal_amdgpu_device_buffer_transfer_context_t buffer_transfer_context;
// Total number of host queue slots allocated in |host_queues|.
iree_host_size_t host_queue_capacity;
// Per-host-queue HSA AQL ring capacity in packets.
uint32_t host_queue_aql_capacity;
// Per-host-queue completion/reclaim ring capacity.
uint32_t host_queue_notification_capacity;
// Per-host-queue kernarg ring capacity in 64-byte blocks.
uint32_t host_queue_kernarg_capacity;
// Per-host-queue device-visible control upload ring capacity in bytes. Zero
// disables the optional upload ring.
uint32_t host_queue_upload_capacity;
// AMD vendor-packet capabilities selected from this GPU agent's ISA.
iree_hal_amdgpu_vendor_packet_capability_flags_t vendor_packet_capabilities;
// Hardware strategy selected for cross-queue epoch waits on this GPU agent.
iree_hal_amdgpu_wait_barrier_strategy_t wait_barrier_strategy;
// Number of live host queues initialized in |host_queues|.
// Only the first |host_queue_count| of the |host_queue_capacity| slots are
// live.
iree_host_size_t host_queue_count;
// One or more host queues mapped to HSA queues on this physical device.
// Flexible array member: |host_queue_capacity| slots are allocated, of which
// |host_queue_count| are initialized.
iree_hal_amdgpu_host_queue_t host_queues[/*host_queue_capacity*/];
} iree_hal_amdgpu_physical_device_t;
// Returns the aligned heap size in bytes required to store the physical device
// data structure. Requires that the options have been verified.
//
// The result accounts for more than sizeof(iree_hal_amdgpu_physical_device_t)
// because the struct ends in the |host_queues| flexible array member.
// NOTE(review): the trailing storage is presumably sized from
// |options->host_queue_count| — confirm in the implementation.
iree_host_size_t iree_hal_amdgpu_physical_device_calculate_size(
const iree_hal_amdgpu_physical_device_options_t* options);
// Initializes a physical device.
// Requires that the |options| have been verified.
//
// |out_physical_device| must reference at least
// iree_hal_amdgpu_physical_device_calculate_size of valid host memory.
//
// Host queues are bound and initialized separately by
// iree_hal_amdgpu_physical_device_assign_frontier. Resources are released with
// iree_hal_amdgpu_physical_device_deinitialize.
//
// NOTE(review): |device_ordinal| presumably selects the GPU agent within the
// topology, and |host_ordinal| with |host_memory_pools| presumably identify the
// nearest CPU agent and its pools — confirm against the implementation.
// Whether |out_physical_device| needs deinitializing after a failure status is
// not specified here.
iree_status_t iree_hal_amdgpu_physical_device_initialize(
iree_hal_device_t* logical_device, iree_hal_amdgpu_system_t* system,
const iree_hal_amdgpu_physical_device_options_t* options,
iree_async_proactor_t* proactor, iree_host_size_t host_ordinal,
const iree_hal_amdgpu_host_memory_pools_t* host_memory_pools,
iree_host_size_t device_ordinal, iree_allocator_t host_allocator,
iree_hal_amdgpu_physical_device_t* out_physical_device);
// Binds and initializes this physical device's host queues after the logical
// device has been assigned a topology/frontier.
//
// Operates on an existing |physical_device| rather than producing a new one.
// Queues initialized here are torn down by
// iree_hal_amdgpu_physical_device_deassign_frontier.
//
// NOTE(review): |base_axis| is presumably the first frontier axis from which
// per-queue axes are derived, and |epoch_signal_table| presumably receives the
// queues' epoch signals — confirm against the implementation.
iree_status_t iree_hal_amdgpu_physical_device_assign_frontier(
iree_hal_device_t* logical_device, iree_hal_amdgpu_system_t* system,
iree_async_proactor_t* proactor,
iree_async_frontier_tracker_t* frontier_tracker,
iree_async_axis_t base_axis,
iree_hal_amdgpu_epoch_signal_table_t* epoch_signal_table,
const iree_hal_amdgpu_host_memory_pools_t* host_memory_pools,
iree_allocator_t host_allocator,
iree_hal_amdgpu_physical_device_t* physical_device);
// Deinitializes any host queues initialized by assign_frontier.
//
// Counterpart to iree_hal_amdgpu_physical_device_assign_frontier. Returns
// nothing, so failures cannot be reported to the caller.
// NOTE(review): "any" suggests this tolerates a device whose queues were never
// or only partially initialized — confirm before relying on it.
void iree_hal_amdgpu_physical_device_deassign_frontier(
iree_hal_amdgpu_physical_device_t* physical_device);
// Enables or disables HSA dispatch timestamp population on all live queues.
//
// |enabled| selects the direction: true enables, false disables.
//
// On enable failure, queues successfully enabled by this call are disabled
// before the status is returned. On disable failure, the function attempts all
// queues and joins failures.
//
// The two directions therefore differ: enabling is rolled back on error, while
// disabling is best-effort across every queue.
iree_status_t iree_hal_amdgpu_physical_device_set_hsa_profiling_enabled(
iree_hal_amdgpu_physical_device_t* physical_device, bool enabled);
// Deinitializes a physical device and deallocates all device-specific
// resources.
//
// Counterpart to iree_hal_amdgpu_physical_device_initialize. Returns nothing,
// so failures cannot be reported to the caller.
// NOTE(review): the storage for |physical_device| itself is supplied by the
// caller of initialize, so it is presumably not freed here — confirm. Whether
// deassign_frontier must be called first is also not stated in this header.
void iree_hal_amdgpu_physical_device_deinitialize(
iree_hal_amdgpu_physical_device_t* physical_device);
// Releases any unused pooled resources.
//
// Returns a status, so trimming can fail; which of the device's pools are
// trimmed is defined by the implementation, not this header.
iree_status_t iree_hal_amdgpu_physical_device_trim(
iree_hal_amdgpu_physical_device_t* physical_device);
#endif // IREE_HAL_DRIVERS_AMDGPU_PHYSICAL_DEVICE_H_