blob: cbc722e178f90133366045c69855eb4f0704a2fe [file]
// Copyright 2025 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#ifndef IREE_HAL_DRIVERS_AMDGPU_UTIL_LIBHSA_H_
#define IREE_HAL_DRIVERS_AMDGPU_UTIL_LIBHSA_H_
#include "iree/base/api.h"
// HSA headers; note that these only somewhat assume shared library usage and
// though the functions are defined we cannot call them.
//
// TODO(benvanik): fork or upstream changes to the headers to make this safer?
// Similar to Vulkan we could have all the function declarations hidden and also
// include proper typed function pointers to aid in shared library use.
#include "third_party/hsa-runtime-headers/include/hsa/amd_hsa_queue.h" // IWYU pragma: export
#include "third_party/hsa-runtime-headers/include/hsa/amd_hsa_signal.h" // IWYU pragma: export
#include "third_party/hsa-runtime-headers/include/hsa/hsa.h" // IWYU pragma: export
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ext_amd.h" // IWYU pragma: export
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ven_amd_loader.h" // IWYU pragma: export
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
typedef struct iree_dynamic_library_t iree_dynamic_library_t;
typedef struct iree_hal_amdgpu_libhsa_t iree_hal_amdgpu_libhsa_t;
//===----------------------------------------------------------------------===//
// Compile-time Configuration
//===----------------------------------------------------------------------===//
// By default we dynamically link against ROCR-Runtime. This allows us to keep
// our binary size small, version our runtime independently from the system
// software, and produce binaries that run on systems without ROCR-Runtime
// available.
//
// In the specific case of a user who is producing a binary that will only ever
// be run on a machine with an AMD GPU and up-to-date drivers we support
// statically linking against the installed ROCR-Runtime instead. This very
// slightly reduces overhead but is primarily useful for producing hermetic
// binaries.
//
// Set `-DIREE_HAL_AMDGPU_LIBHSA_STATIC=1` in CMake to link against the package.
// If already linking ROCR-Runtime into the top-level binary then add a compiler
// define `-DIREE_HAL_AMDGPU_LIBHSA_STATIC=1` to have this code emit imports
// (and then run LTO!).
#if !defined(IREE_HAL_AMDGPU_LIBHSA_STATIC)
#define IREE_HAL_AMDGPU_LIBHSA_STATIC 0
// For manual testing:
// #define IREE_HAL_AMDGPU_LIBHSA_STATIC 1
#endif // IREE_HAL_AMDGPU_LIBHSA_STATIC
#define IREE_HAL_AMDGPU_LIBHSA_TRACE_CATEGORY_ALWAYS (1 << 0)
#define IREE_HAL_AMDGPU_LIBHSA_TRACE_CATEGORY_SIGNALS (1 << 1)
#define IREE_HAL_AMDGPU_LIBHSA_TRACE_CATEGORY_QUEUES (1 << 2)
#define IREE_HAL_AMDGPU_LIBHSA_TRACE_CATEGORY_ALL \
(IREE_HAL_AMDGPU_LIBHSA_TRACE_CATEGORY_ALWAYS | \
IREE_HAL_AMDGPU_LIBHSA_TRACE_CATEGORY_SIGNALS | \
IREE_HAL_AMDGPU_LIBHSA_TRACE_CATEGORY_QUEUES)
// TODO(benvanik): expose as a more configurable setting - for now we try to
// stick to low-frequency events by default and have this for people debugging
// the implementation who want to see all calls.
#if !defined(IREE_HAL_AMDGPU_LIBHSA_TRACING_MODE)
#if IREE_TRACING_FEATURES && !defined(NDEBUG)
#define IREE_HAL_AMDGPU_LIBHSA_TRACING_MODE \
IREE_HAL_AMDGPU_LIBHSA_TRACE_CATEGORY_ALL
#else
#define IREE_HAL_AMDGPU_LIBHSA_TRACING_MODE 0
#endif // IREE_TRACING_FEATURES && !NDEBUG
#endif // IREE_HAL_AMDGPU_LIBHSA_TRACING_MODE
#if !defined(IREE_AMDGPU_DEVICE_PTR)
// Indicates a pointer is on the device. Used as annotations in host code.
// Device memory may not be host accessible or may have massive performance
// implications for accessing. This serves as a warning.
#define IREE_AMDGPU_DEVICE_PTR
#endif // IREE_AMDGPU_DEVICE_PTR
//===----------------------------------------------------------------------===//
// iree_hal_amdgpu_libhsa_t
//===----------------------------------------------------------------------===//
typedef uint32_t iree_hal_amdgpu_libhsa_flags_t;
enum iree_hal_amdgpu_libhsa_flag_bits_e {
IREE_HAL_AMDGPU_LIBHSA_FLAG_NONE = 0u,
};
// Dynamically loaded libhsa-runtime64.so (or equivalent).
// Contains function pointers to resolved HSA API symbols as well as extension
// tables when available.
//
// Upon loading the `hsa_init` routine will be called. If there are other
// existing loaded HSA instances this will increase the reference count. Upon
// destruction `hsa_shut_down` will be called to decrement the HSA reference
// count. Within a single process HSA _should_ (and maybe _must_) be loaded from
// the same exact dynamic library; it's ok for different HAL drivers to load HSA
// redundantly so long as it's the same implementation.
//
// Because HSA ref counts itself we avoid doing so here. Each copy of the loaded
// library retains a reference and cloning it will always retain another. The
// structure can be treated by-value so long as the appropriate copy method is
// used when cloning. This design reduces one level of indirection into HSA: the
// library can be embedded in the device structure and function pointer offsets
// can be statically calculated. Does this matter in the grand scheme of things?
// Probably not.
//
// Thread-safe; immutable.
typedef struct iree_hal_amdgpu_libhsa_t {
// True if hsa_init was called and hsa_shut_down must be called.
bool initialized;
#if !IREE_HAL_AMDGPU_LIBHSA_STATIC
// Loaded HSA dynamic library.
iree_dynamic_library_t* library;
#define IREE_HAL_AMDGPU_LIBHSA_PFN(trace_category, result_type, symbol, decl, \
args) \
result_type(HSA_API* symbol)(decl);
#define DECL(...) __VA_ARGS__
#include "iree/hal/drivers/amdgpu/util/libhsa_tables.h" // IWYU pragma: export
#endif // !IREE_HAL_AMDGPU_LIBHSA_STATIC
// HSA_EXTENSION_AMD_LOADER extension table.
// Always required even when statically linking.
hsa_ven_amd_loader_1_03_pfn_t amd_loader;
} iree_hal_amdgpu_libhsa_t;
// Initializes |out_libhsa| in-place with dynamically loaded HSA symbols.
// iree_hal_amdgpu_libhsa_deinitialize must be used to release the library
// resources. The populated structure is immutable once initialized but if
// copied the iree_hal_amdgpu_libhsa_copy API must be used (no memcpy!).
//
// |search_paths| will override the default library search paths and look for
// the canonical library file under each before falling back to the defaults.
// The `IREE_HAL_AMDGPU_LIBHSA_PATH` environment variable can also be set and
// will be checked after the explicitly provided search paths.
IREE_API_EXPORT iree_status_t iree_hal_amdgpu_libhsa_initialize(
iree_hal_amdgpu_libhsa_flags_t flags, iree_string_view_list_t search_paths,
iree_allocator_t host_allocator, iree_hal_amdgpu_libhsa_t* out_libhsa);
// Deinitializes |libhsa| by unloading the backing library. All function
// pointers will be invalidated, though be warned they may still be loaded and
// may appear to work (we can't reach out to clear cached pointers).
IREE_API_EXPORT void iree_hal_amdgpu_libhsa_deinitialize(
iree_hal_amdgpu_libhsa_t* libhsa);
// Copies all resolved symbols from |libhsa| to |out_libhsa| and retains the
// HSA library. The target is assumed uninitialized.
IREE_API_EXPORT iree_status_t
iree_hal_amdgpu_libhsa_copy(const iree_hal_amdgpu_libhsa_t* libhsa,
iree_hal_amdgpu_libhsa_t* out_libhsa);
// Appends the absolute path of the shared library or DLL providing the dynamic
// symbols.
IREE_API_EXPORT iree_status_t iree_hal_amdgpu_libhsa_append_path_to_builder(
const iree_hal_amdgpu_libhsa_t* libhsa, iree_string_builder_t* builder);
// Returns an IREE status with the given |hsa_status| message formatted with
// textual information. |message| is optional.
IREE_API_EXPORT iree_status_t iree_status_from_hsa_status(
const char* file, const uint32_t line, hsa_status_t hsa_status,
const char* symbol, const char* message);
//===----------------------------------------------------------------------===//
// HSA API Wrappers
//===----------------------------------------------------------------------===//
// Wraps an iree_hal_amdgpu_libhsa_t* for use with the API wrappers.
// All calls should either embed their file and line information directly (if a
// thunk) or use this macro.
//
// Example:
// iree_hal_amdgpu_libhsa_t* my_libhsa = ...;
// iree_hsa_agent_get_info(IREE_LIBHSA(my_libhsa), agent, info, ...);
// // ... expanded to ...
// iree_hsa_agent_get_info(my_libhsa, __FILE__, __LINE__, agent, info, ...);
#define IREE_LIBHSA(libhsa) (libhsa), __FILE__, __LINE__
#define IREE_HAL_AMDGPU_LIBHSA_PFN_hsa_status_t(trace_category, result_type, \
symbol, decl, ...) \
iree_status_t iree_##symbol( \
const iree_hal_amdgpu_libhsa_t* IREE_RESTRICT libhsa, const char* file, \
const uint32_t line _COMMA_DECL(decl));
#define IREE_HAL_AMDGPU_LIBHSA_PFN_result(trace_category, result_type, symbol, \
decl, ...) \
result_type iree_##symbol( \
const iree_hal_amdgpu_libhsa_t* IREE_RESTRICT libhsa, const char* file, \
const uint32_t line _COMMA_DECL(decl));
#define IREE_HAL_AMDGPU_LIBHSA_PFN_void IREE_HAL_AMDGPU_LIBHSA_PFN_result
#define IREE_HAL_AMDGPU_LIBHSA_PFN_uint32_t IREE_HAL_AMDGPU_LIBHSA_PFN_result
#define IREE_HAL_AMDGPU_LIBHSA_PFN_uint64_t IREE_HAL_AMDGPU_LIBHSA_PFN_result
#define IREE_HAL_AMDGPU_LIBHSA_PFN_hsa_signal_value_t \
IREE_HAL_AMDGPU_LIBHSA_PFN_result
#define IREE_HAL_AMDGPU_LIBHSA_PFN(trace_category, result_type, symbol, decl, \
...) \
IREE_HAL_AMDGPU_LIBHSA_PFN_##result_type(trace_category, result_type, \
symbol, DECL(decl), __VA_ARGS__)
#define DECL(...) __VA_ARGS__
#define _COMMA_DECL(...) __VA_OPT__(, ) __VA_ARGS__
#include "iree/hal/drivers/amdgpu/util/libhsa_tables.h" // IWYU pragma: export
#undef _COMMA_DECL
#undef IREE_HAL_AMDGPU_LIBHSA_PFN_hsa_status_t
#undef IREE_HAL_AMDGPU_LIBHSA_PFN_result
#undef IREE_HAL_AMDGPU_LIBHSA_PFN_void
#undef IREE_HAL_AMDGPU_LIBHSA_PFN_uint32_t
#undef IREE_HAL_AMDGPU_LIBHSA_PFN_uint64_t
#undef IREE_HAL_AMDGPU_LIBHSA_PFN_hsa_signal_value_t
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // IREE_HAL_DRIVERS_AMDGPU_UTIL_LIBHSA_H_