blob: 27577268fc8e6ddc334ca5ce01277731ad443ac4 [file] [log] [blame]
// Copyright 2020 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#ifndef IREE_HAL_LOCAL_EXECUTABLE_LIBRARY_H_
#define IREE_HAL_LOCAL_EXECUTABLE_LIBRARY_H_
// NOTE: this file is designed to be a standalone header: it is embedded in the
// compiler and must not take any dependencies on the runtime HAL code.
// Changes here will require changes to the compiler and must be versioned as if
// this was a schema: backwards-incompatible changes require version bumps or
// the ability to feature-detect at runtime.
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
//===----------------------------------------------------------------------===//
// Common utilities included to reduce dependencies
//===----------------------------------------------------------------------===//
// `restrict` keyword, not supported by some older compilers.
// We define our own macro in case dependencies use `restrict` differently.
#if defined(_MSC_VER) && _MSC_VER >= 1900
#define IREE_RESTRICT __restrict
#elif defined(_MSC_VER)
#define IREE_RESTRICT
#elif defined(__cplusplus)
#define IREE_RESTRICT __restrict__
#else
#define IREE_RESTRICT restrict
#endif // _MSC_VER
//===----------------------------------------------------------------------===//
// Runtime feature support metadata
//===----------------------------------------------------------------------===//
// Defines a bitfield of features that the library requires or supports.
enum iree_hal_executable_library_feature_bits_t {
IREE_HAL_EXECUTABLE_LIBRARY_FEATURE_NONE = 0u,
// TODO(benvanik): declare features for debugging/coverage/printf/etc.
// These will control which symbols are injected into the library at runtime.
};
typedef uint32_t iree_hal_executable_library_features_t;
// Defines a set of supported sanitizers that libraries may be compiled with.
// Loaders can use this declaration to check as to whether the library is
// compatible with the hosting environment for cases where the sanitizer
// requires host support.
typedef enum iree_hal_executable_library_sanitizer_kind_e {
IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_NONE = 0,
// Indicates the library is compiled to use AddressSanitizer:
// https://clang.llvm.org/docs/AddressSanitizer.html
// Equivalent compiler flag: -fsanitize=address
IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_ADDRESS = 1,
// Indicates the library is compiled to use MemorySanitizer:
// https://clang.llvm.org/docs/MemorySanitizer.html
// Equivalent compiler flag: -fsanitize=memory
IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_MEMORY = 2,
// Indicates the library is compiled to use ThreadSanitizer:
// https://clang.llvm.org/docs/ThreadSanitizer.html
// Equivalent compiler flag: -fsanitize=thread
IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_THREAD = 3,
// Indicates the library is compiled to use UndefinedBehaviorSanitizer:
// https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html
// Equivalent compiler flag: -fsanitize=undefined
IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_UNDEFINED = 4,
IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_MAX_ENUM = INT32_MAX,
} iree_hal_executable_library_sanitizer_kind_t;
//===----------------------------------------------------------------------===//
// Versioning and interface querying
//===----------------------------------------------------------------------===//
typedef struct iree_hal_executable_environment_v0_t
iree_hal_executable_environment_v0_t;
// Version code indicating the minimum required runtime structures.
// Runtimes cannot load executables with newer versions but may be able to load
// older versions if backward compatibility is enabled.
//
// NOTE: until we hit v1 the versioning scheme here is not set in stone.
// We may want to make this major release number, date codes (0x20220307),
// or some semantic versioning we track in whatever spec we end up having.
typedef uint32_t iree_hal_executable_library_version_t;
#define IREE_HAL_EXECUTABLE_LIBRARY_VERSION_0_3 0x00000003u
// The latest version of the library API; can be used to populate the
// iree_hal_executable_library_header_t::version when building libraries.
#define IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST \
IREE_HAL_EXECUTABLE_LIBRARY_VERSION_0_3
// A header present at the top of all versions of the library API used by the
// runtime to ensure version compatibility.
typedef struct iree_hal_executable_library_header_t {
// Version of the API this library was built with, which was likely the value
// of IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST.
iree_hal_executable_library_version_t version;
// Name used for logging/diagnostics.
const char* name;
// Bitfield of features required/supported by this executable.
iree_hal_executable_library_features_t features;
// Which sanitizer the library is compiled to use, if any.
// Libraries meant for use with a particular sanitizer will are only usable
// with hosting code that is using the same sanitizer.
iree_hal_executable_library_sanitizer_kind_t sanitizer;
} iree_hal_executable_library_header_t;
// Exported function from dynamic libraries for querying library information.
//
// The provided |max_version| is the maximum version the caller supports;
// callees must return NULL if their lowest available version is greater
// than the max version supported by the caller.
//
// The provided |environment| field contains information about the hosting
// execution environment that the executable may use to specialize its
// implementation, such as using specific imports or exporting
// architecture-specific dispatch routines. Some environmental properties may
// change per-invocation such as the CPU info when performing dispatches on
// heterogenous processors that may change over the lifetime of the program.
typedef const iree_hal_executable_library_header_t** (
*iree_hal_executable_library_query_fn_t)(
iree_hal_executable_library_version_t max_version,
const iree_hal_executable_environment_v0_t* environment);
// Function name exported from dynamic libraries (pass to dlsym).
#define IREE_HAL_EXECUTABLE_LIBRARY_EXPORT_NAME \
"iree_hal_executable_library_query"
//===----------------------------------------------------------------------===//
// IREE_HAL_EXECUTABLE_LIBRARY_VERSION_0_*
//===----------------------------------------------------------------------===//
// Function signature of imported functions for use in the executable.
// Each call takes opaque parameters as defined by the imported function.
// Both the compiler and the runtime must agree on the parameter format
// (including struct alignment and packing) and doing so is outside the scope
// of this API. In general one should only pass precisely what they need
// (pointers directly into buffers being manipulated, arguments, etc) and not
// try to replicate the dispatch structure (workgroup information and bindings)
// so that the imported functions can be versioned independently from this
// specification.
//
// Returns 0 on success and non-zero on failure. Failures will cause device loss
// and should only be used to communicate serious issues that should abort all
// execution within the current device. Buffer overflows are a good example of
// a useful failure though the HAL does not mandate that all overflows are
// caught and only that they are not harmful - clamping byte ranges and never
// returning a failure is sufficient.
typedef int (*iree_hal_executable_import_v0_t)(void* import_params);
// A thunk function used to call an import.
// All imports must be called through this function by passing the import
// function pointer as the first argument followed by the arguments of the
// import function itself.
typedef int (*iree_hal_executable_import_thunk_v0_t)(
iree_hal_executable_import_v0_t fn_ptr, void* import_params);
// Declares imports available to the executable library at runtime.
// To enable linker isolation, ABI shimming, and import multi-versioning we use
// this import table exclusively and do not allow platform-level linking. If it
// were allowed the deployment situation gets significantly more complex as the
// libraries containing the imported symbols will differ on all platforms, will
// have the platform-dependent ABI (Windows, MacOS, etc), and may not be
// available at all (bare-metal).
//
// Static libraries may choose to still dynamically link against external
// symbols without using this table as in that scenario much of the above
// concerns do not apply: all code is being linked together into the same binary
// and symbol availability is known during build-time linking. Static linking
// also enables LTO to strip any import not used by any executables in contrast
// to the dynamic style elsewhere.
//
// Represented as a struct-of-arrays for more efficient packing and more
// locality during lookup. Each subarray - when not omitted and NULL - is
// indexed by import ordinal and has up to |count| entries.
typedef struct iree_hal_executable_import_table_v0_t {
// Total number of imports in the table.
uint32_t count;
// Import symbol name encoding the name and whether it is weak.
// Example: `mylib_some_fn_v2?`
// `mylib_...`:
// Prefix indicating the owner of the function; symbols have a global
// namespace and this is used to reduce collisions.
// `some_fn...`:
// Name of the function used to link to the imports available in the
// hosting executable.
// `..._v2`:
// Function-specified version number used to allow multiple versions to
// to be imported. For backward compatibility one could import both
// `some_fn_v1?` and `some_fn_v2?` and use whichever is available.
// Note that this is just a convention for the suffix and can be anything.
// `?`:
// Indicates when an import is optional. If the import of the specified
// version is not found the table entry will be NULL. When omitted if the
// import is unavailable loading will fail.
//
// The symbol table is sorted ascending alphabetical (by strcmp).
const char* const* symbols;
} iree_hal_executable_import_table_v0_t;
// Maximum number of data fields in iree_hal_processor_v0_t.
#define IREE_HAL_PROCESSOR_DATA_CAPACITY_V0 8
// Architecture-specific CPU information available to executables.
// This encodes zero or more fields of opaque processor data.
// The intent is that this structure can be put in .rodata when there are no
// runtime features that need to be queried.
//
// The format of the data is architecture-specific as by construction no value
// will ever be used in a compiled binary from another architecture. This
// allows us to simplify this interface as we can't for example load the same
// executable library for both aarch64 on riscv32 and don't need to normalize
// any of the fields across them both.
//
// See iree/schemas/cpu_data.h for details.
typedef struct iree_hal_processor_v0_t {
// Opaque architecture-specific encoding in 64-bit words.
// This may represent a fixed-length data structure, a series of hardware
// registers, or key-value pairs.
//
// The contents are opaque here as to support out-of-tree architectures. The
// runtime code deriving the identifier/flags and providing it here is loosely
// coupled with the compiler code emitting checks based on the identifier and
// only those two places ever need to change.
uint64_t data[IREE_HAL_PROCESSOR_DATA_CAPACITY_V0];
} iree_hal_processor_v0_t;
static_assert(sizeof(iree_hal_processor_v0_t) % sizeof(uint64_t) == 0,
"8-byte alignment required");
// Defines the environment in which the executable is being used.
// Executables only have access to the information in this structure and must
// make all decisions based on it; this ensures executables are portable across
// operating environments (Linux, Mac, bare-metal, web, etc) by not having
// platform-specific syscalls and register query emulation.
typedef struct iree_hal_executable_environment_v0_t {
// Specialization constants available to the executable, if any.
// Contains as many as declared in the library header.
const uint32_t* constants;
// Thunk function for calling imports. All calls must be made through this.
iree_hal_executable_import_thunk_v0_t import_thunk;
// Optional imported functions available for use within the executable.
// Contains one entry per imported function. If an import was marked as weak
// then the corresponding entry may be NULL.
const iree_hal_executable_import_v0_t* imports;
// Optional architecture-specific CPU information.
// In heterogenous processors this may represent any of the subarchitecture
// types as it is derived from the core the calling thread is scheduled on.
// Will be all zeros if unavailable.
iree_hal_processor_v0_t processor;
} iree_hal_executable_environment_v0_t;
// Read-only per-dispatch state passed to each workgroup in a dispatch.
//
// We layout to try to fit everything commonly used into the first cache line
// (on archs with 64-bit pointers; 32-bit fits in a single line).
//
// For workgroup dimensions we allow the full 32-bit range on X and Y as those
// are the primary distribution dimensions. Z is the coarsest control and is
// usually in the 1-16 range; any higher and it can pessimize scheduling. Almost
// all GPUs also have this limitation (max Z of 65K) for the same reason.
typedef struct iree_hal_executable_dispatch_state_v0_t {
// Workgroup size chosen for the dispatch. For compilation modes where the
// workgroup size is constant this may be ignored.
uint32_t workgroup_size_x;
uint32_t workgroup_size_y;
uint16_t workgroup_size_z;
// Total number of available 4 byte push constant values in |push_constants|.
uint16_t push_constant_count;
// Total workgroup count for the dispatch. This is sourced from either the
// original dispatch call (for iree_hal_command_buffer_dispatch) or the
// indirection buffer (for iree_hal_command_buffer_dispatch_indirect).
uint32_t workgroup_count_x;
uint32_t workgroup_count_y;
uint16_t workgroup_count_z;
// Estimated maximum concurrent workgroups; loosely maps to the number of
// processors allowed to execute the dispatch. The actual number will vary
// based on competing dispatches and dynamic executor configuration.
uint8_t max_concurrency;
// Total number of binding base pointers in |binding_ptrs| and
// |binding_lengths|. The set is packed densely based on which bindings are
// used (known at compile-time).
uint8_t binding_count;
// |push_constant_count| values.
const uint32_t* push_constants;
// Base pointers to each binding buffer.
void* const* binding_ptrs;
// The length of each binding in bytes, 1:1 with |binding_ptrs|.
const size_t* binding_lengths;
// NOTE: the above fields are frequently accessed and should be kept together
// to ensure cache-friendly behavior. The first instructions every dispatch
// executes are loads from the fields and we want to avoid a cascade of
// cache misses. Less-frequently used fields can follow.
} iree_hal_executable_dispatch_state_v0_t;
static_assert(sizeof(iree_hal_executable_dispatch_state_v0_t) <= 64,
"try keeping dispatch state small enough to fit in a cache line");
// Read-only per-workgroup state passed to each workgroup in a dispatch.
//
// We layout to try to fit everything commonly used into the first cache line
// (on archs with 64-bit pointers; 32-bit fits in a single line).
typedef struct iree_hal_executable_workgroup_state_v0_t {
// Workgroup ID of the currently executing workgroup.
// This is in the range of 0-workgroup_count and each unique workgroup is to
// perform workgroup_size invocations.
uint32_t workgroup_id_x;
uint32_t workgroup_id_y;
uint16_t workgroup_id_z;
// Reserved for future use.
uint16_t reserved;
// Logical processor identifier used to index into processor info fields.
// Depending on the implementation this may be an ordinal, a bitfield, or an
// opaque unique identifier.
//
// NOTE: we could steal bits from the |processor_id| if needed; today the ID
// is the global ID but it really only needs to be within the current node
// (8-bits, or 16-bit for single-node thousand-core future proofing).
uint32_t processor_id;
// Scratch memory available for use by the workgroup.
// Requires a non-zero value to be specified for |local_memory_pages|; at
// least the size specified will be available. This memory is transient and
// exclusive to the workgroup. The provided pointer may be NULL if no
// workgroup local memory was requested.
void* local_memory;
// Total number of bytes available in |local_memory|. This may be larger than
// the requested amount.
uint32_t local_memory_size;
// +4 trailing bytes of free space
} iree_hal_executable_workgroup_state_v0_t;
static_assert(
sizeof(iree_hal_executable_workgroup_state_v0_t) <= 64,
"try keeping workgroup state small enough to fit in a cache line");
// Function signature of exported executable entry points.
// The same |environment| is passed to all dispatches.
// The same |dispatch_state| is passed to all workgroups within a dispatch.
// A unique |workgroup_state| is passed to every workgroup within a dispatch.
//
// Returns 0 on success and non-zero on failure. Failures will cause device loss
// and should only be used to communicate serious issues that should abort all
// execution within the current device. Buffer overflows are a good example of
// a useful failure though the HAL does not mandate that all overflows are
// caught and only that they are not harmful - clamping byte ranges and never
// returning a failure is sufficient.
typedef int (*iree_hal_executable_dispatch_v0_t)(
const iree_hal_executable_environment_v0_t* environment,
const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
const iree_hal_executable_workgroup_state_v0_t* workgroup_state);
// Bytes per page of workgroup local memory.
// This is chosen to match the common page size of devices.
#define IREE_HAL_WORKGROUP_LOCAL_MEMORY_PAGE_SIZE 4096
// Attributes for exported dispatch functions defining how they are to be
// executed. 0 defaults are well-specified and the entire attributes table may
// be omitted if no dispatch functions require these fields.
typedef struct iree_hal_executable_dispatch_attrs_v0_t {
// Number of IREE_HAL_WORKGROUP_LOCAL_MEMORY_PAGE_SIZE byte pages (or 0)
// indicating how much workgroup local memory is required for the dispatch.
// This is the size of the buffer referenced by the `local_memory` argument.
uint16_t local_memory_pages;
// Must be 0. May be used in the future for flags controlling the dispatch
// behavior/synchronization requirements.
uint16_t reserved;
} iree_hal_executable_dispatch_attrs_v0_t;
static_assert(sizeof(iree_hal_executable_dispatch_attrs_v0_t) == 4, "uint32_t");
// Source location information for a dispatch function indicating what code was
// used to generate it. This only represents a single source snapshot, of which
// there may be multiple valid possibilities (source program in Python, imported
// high level framework .mlir, LLVM bitcode, etc.).
typedef struct iree_hal_executable_src_loc_v0_t {
// The line within the file at |path|.
uint32_t line;
// The length of |path|.
uint32_t path_length;
// The path (absolute or relative) to the source file.
const char* path;
} iree_hal_executable_src_loc_v0_t;
// A table of exported functions arranged as a struct-of-arrays for more
// efficient packing and faster lookup. Each subarray - when not omitted and
// NULL - is indexed by export ordinal and has up to |count| entries.
typedef struct iree_hal_executable_export_table_v0_t {
// Total number of exports in the table.
uint32_t count;
// Function pointers for each exported entry point.
const iree_hal_executable_dispatch_v0_t* ptrs;
// Optional table of attributes 1:1 with ptrs.
// Omitting the table entirely means that no exports need workgroup local
// memory (or whatever else we pack into the attributes).
const iree_hal_executable_dispatch_attrs_v0_t* attrs;
// Optional table of export function entry point names 1:1 with ptrs.
// These names are only used for tracing/debugging and can be omitted to save
// binary size.
const char* const* names;
// Optional table of entry point tags 1:1 with ptrs.
// Used to describe the entry point in a human-readable format useful for
// verbose logging. The string values, when present, may be attached to
// tracing/debugging events related to the entry point.
const char* const* tags;
// Optional table of source locations 1:1 with ptrs.
const iree_hal_executable_src_loc_v0_t* src_locs;
} iree_hal_executable_export_table_v0_t;
// A table declaring the executable-level constants that can be used to
// specialize the executable behavior.
typedef struct iree_hal_executable_constant_table_v0_t {
// Total number of constants in the table.
uint32_t count;
// We could add more metadata here if we wanted to enable reflection.
} iree_hal_executable_constant_table_v0_t;
// Structure used for v0 library interfaces.
// The entire structure is designed to be read-only and able to live embedded in
// the binary .rdata section.
//
// The information held within the structure is not cached by the runtime.
// Implementations may choose to heap allocate this structure and modify its
// members at runtime so long as they observe the thread-safety guarantees.
// For example, a JIT may default all exports to JIT thunk functions and then
// atomically swap them out for the translated function pointers as they are
// available.
typedef struct iree_hal_executable_library_v0_t {
// Version/metadata header.
// Will have a version of IREE_HAL_EXECUTABLE_LIBRARY_VERSION_*.
const iree_hal_executable_library_header_t* header;
// Table of imported functions available to functions in the executable.
iree_hal_executable_import_table_v0_t imports;
// Table of exported functions from the executable.
iree_hal_executable_export_table_v0_t exports;
// Table of executable-level constants.
iree_hal_executable_constant_table_v0_t constants;
} iree_hal_executable_library_v0_t;
#endif // IREE_HAL_LOCAL_EXECUTABLE_LIBRARY_H_