blob: 66782fb8ec342147a312333fc49aaff1a4cdfedf [file]
// Copyright 2025 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#ifndef IREE_HAL_DRIVERS_AMDGPU_UTIL_VMEM_H_
#define IREE_HAL_DRIVERS_AMDGPU_UTIL_VMEM_H_
#include "iree/base/api.h"
#include "iree/hal/drivers/amdgpu/util/libhsa.h"
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
typedef struct iree_hal_amdgpu_topology_t iree_hal_amdgpu_topology_t;
//===----------------------------------------------------------------------===//
// Virtual Memory Utilities
//===----------------------------------------------------------------------===//
// Semantically defines how a vmem allocation can be accessed.
// Consumed when deriving per-agent accessibility from a topology (see
// iree_hal_amdgpu_vmem_ringbuffer_initialize_with_topology).
typedef enum iree_hal_amdgpu_vmem_access_mode_e {
  // All agents may produce and consume the memory. Read/write for all agents.
  IREE_HAL_AMDGPU_ACCESS_MODE_SHARED = 0u,
  // Memory is accessed exclusively by the agent it is allocated on.
  // No other agent has access. Read/write for agent only.
  IREE_HAL_AMDGPU_ACCESS_MODE_EXCLUSIVE,
  // Memory is consumed exclusively by the agent it is allocated on but may be
  // produced from any agent. This is useful for mailboxes. Read for agent only
  // and write for all agents.
  IREE_HAL_AMDGPU_ACCESS_MODE_EXCLUSIVE_CONSUMER,
  // Memory is produced exclusively by the agent it is allocated on but may be
  // consumed from any agent. This is useful for outbound buffers. Write for
  // agent only and read for all agents.
  IREE_HAL_AMDGPU_ACCESS_MODE_EXCLUSIVE_PRODUCER,
} iree_hal_amdgpu_vmem_access_mode_t;
// Selects the HSA vmem allocation type for a ringbuffer's backing memory.
//
// AMD's HSA extension exposes this as hsa_amd_memory_type_t with bare
// MEMORY_TYPE_* enumerants; this local enum keeps that upstream namespace leak
// out of the rest of the driver.
typedef enum iree_hal_amdgpu_vmem_memory_type_e {
  // Default vmem allocation mode for device-local pools.
  // Maps to the default hsa_amd_memory_type_t value.
  IREE_HAL_AMDGPU_VMEM_MEMORY_TYPE_DEFAULT = 0,
  // Pinned host allocation mode for CPU memory pools.
  // Maps to the pinned-host hsa_amd_memory_type_t value.
  IREE_HAL_AMDGPU_VMEM_MEMORY_TYPE_PINNED_HOST = 1,
} iree_hal_amdgpu_vmem_memory_type_t;
// Finds a global memory pool on the |agent| matching any of the specified
// global flags.
// Returns a failure status if no matching pool is available; callers that
// want a non-failing query should use the iree_hal_amdgpu_try_find_* variants
// below.
iree_status_t iree_hal_amdgpu_find_global_memory_pool(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_agent_t agent,
    hsa_amd_memory_pool_global_flag_t match_flags,
    hsa_amd_memory_pool_t* out_pool);
// Finds a coarse-grained memory pool on the |agent|.
// The returned pool will support allocations and be
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED.
// Returns a failure status if no such pool exists; see
// iree_hal_amdgpu_try_find_coarse_global_memory_pool for a non-failing query.
iree_status_t iree_hal_amdgpu_find_coarse_global_memory_pool(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_agent_t agent,
    hsa_amd_memory_pool_t* out_pool);
// Finds a fine-grained memory pool on the |agent|.
// The returned pool will support allocations and be either
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED or
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED.
// Returns a failure status if no such pool exists; see
// iree_hal_amdgpu_try_find_fine_global_memory_pool for a non-failing query.
iree_status_t iree_hal_amdgpu_find_fine_global_memory_pool(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_agent_t agent,
    hsa_amd_memory_pool_t* out_pool);
// Tries to find a coarse-grained memory pool on the |agent|.
// Returns true and populates |out_pool| if found, false otherwise.
// Unlike iree_hal_amdgpu_find_coarse_global_memory_pool a missing pool is not
// treated as an error.
bool iree_hal_amdgpu_try_find_coarse_global_memory_pool(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_agent_t agent,
    hsa_amd_memory_pool_t* out_pool);
// Tries to find a fine-grained memory pool on the |agent|.
// Returns true and populates |out_pool| if found, false otherwise.
// Unlike iree_hal_amdgpu_find_fine_global_memory_pool a missing pool is not
// treated as an error.
bool iree_hal_amdgpu_try_find_fine_global_memory_pool(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_agent_t agent,
    hsa_amd_memory_pool_t* out_pool);
//===----------------------------------------------------------------------===//
// iree_hal_amdgpu_vmem_ringbuffer_t
//===----------------------------------------------------------------------===//
// An allocated ringbuffer using virtual memory mapping to present a contiguous
// virtual address range that is backed by a single physical buffer but that
// allows access before and after it.
//
// This presents as a ringbuffer that does not need any special logic for
// wrapping from base offsets used when copying in memory. It follows the
// approach documented in https://lo.calho.st/posts/black-magic-buffer/ and
// https://www.mikeash.com/pyblog/friday-qa-2012-02-17-ring-buffers-and-mirrored-memory-part-ii.html
// of virtual memory mapping the buffer multiple times, example code:
// https://github.com/google/wuffs/blob/main/script/mmap-ring-buffer.c
//
// We use SVM to allocate the physical memory of the ringbuffer and then stitch
// together 3 virtual memory ranges in one contiguous virtual allocation that
// aliases the physical allocation. By treating the middle range as the base
// buffer pointer we are then able to freely dereference both before and after
// the base pointer by up to the ringbuffer size in length.
//
//   physical: <ringbuffer size> --+------+------+
//                                 v      v      v
//   virtual:                   [prev] [base] [next]
//                                ^      ^
//                                |      +-- ring_base_ptr
//                                +--------- va_base_ptr
typedef struct iree_hal_amdgpu_vmem_ringbuffer_t {
  // Capacity of the ringbuffer in bytes.
  // May be larger than the requested size if adjusted to the minimum allocation
  // granule.
  iree_device_size_t capacity;
  // Physical allocation of the pinned ringbuffer memory.
  // This is sized to the requested capacity of the ringbuffer.
  hsa_amd_vmem_alloc_handle_t alloc_handle;
  // Base virtual address pointer of the ringbuffer. This is the start of the
  // reserved address range ([prev] in the diagram above).
  IREE_AMDGPU_DEVICE_PTR void* va_base_ptr;
  // Base virtual address pointer of the central ringbuffer contents ([base] in
  // the diagram above). Dereferences up to the ringbuffer size before or after
  // this pointer remain within the mapped range.
  IREE_AMDGPU_DEVICE_PTR void* ring_base_ptr;
} iree_hal_amdgpu_vmem_ringbuffer_t;
// Initializes a ringbuffer by allocating the physical and virtual memory of at
// least the requested |min_capacity| with at least 64 byte alignment.
// |memory_type| selects the HSA allocation mode for the selected pool; callers
// allocating from host CPU pools should use
// IREE_HAL_AMDGPU_VMEM_MEMORY_TYPE_PINNED_HOST while device-local pools
// generally use IREE_HAL_AMDGPU_VMEM_MEMORY_TYPE_DEFAULT.
// |access_descs| will be used to setup accessibility.
//
// On success the caller must release the resulting ringbuffer with
// iree_hal_amdgpu_vmem_ringbuffer_deinitialize.
iree_status_t iree_hal_amdgpu_vmem_ringbuffer_initialize(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_agent_t local_agent,
    hsa_amd_memory_pool_t memory_pool,
    iree_hal_amdgpu_vmem_memory_type_t memory_type,
    iree_device_size_t min_capacity, iree_host_size_t access_desc_count,
    const hsa_amd_memory_access_desc_t* access_descs,
    iree_hal_amdgpu_vmem_ringbuffer_t* out_ringbuffer);
// Initializes a ringbuffer by allocating the physical and virtual memory of at
// least the requested power-of-two |min_capacity| with at least 64 byte
// alignment. |topology| and |access_mode| will be used to setup accessibility.
// NOTE(review): unlike iree_hal_amdgpu_vmem_ringbuffer_initialize this
// documents a power-of-two |min_capacity| requirement — confirm callers obey.
//
// On success the caller must release the resulting ringbuffer with
// iree_hal_amdgpu_vmem_ringbuffer_deinitialize.
iree_status_t iree_hal_amdgpu_vmem_ringbuffer_initialize_with_topology(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_agent_t local_agent,
    hsa_amd_memory_pool_t memory_pool,
    iree_hal_amdgpu_vmem_memory_type_t memory_type,
    iree_device_size_t min_capacity, const iree_hal_amdgpu_topology_t* topology,
    iree_hal_amdgpu_vmem_access_mode_t access_mode,
    iree_hal_amdgpu_vmem_ringbuffer_t* out_ringbuffer);
// Deinitializes a ringbuffer and frees all physical and virtual allocations.
// Must only be called on a |ringbuffer| previously initialized with one of the
// iree_hal_amdgpu_vmem_ringbuffer_initialize* functions.
void iree_hal_amdgpu_vmem_ringbuffer_deinitialize(
    const iree_hal_amdgpu_libhsa_t* libhsa,
    iree_hal_amdgpu_vmem_ringbuffer_t* ringbuffer);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // IREE_HAL_DRIVERS_AMDGPU_UTIL_VMEM_H_