| // Copyright 2020 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #ifndef IREE_HAL_COMMAND_BUFFER_H_ |
| #define IREE_HAL_COMMAND_BUFFER_H_ |
| |
| #include <stdbool.h>
| #include <stdint.h>
| #include <string.h>  // memset used by inline helpers below
| |
| #include "iree/base/api.h" |
| #include "iree/hal/allocator.h" |
| #include "iree/hal/buffer.h" |
| #include "iree/hal/channel.h" |
| #include "iree/hal/event.h" |
| #include "iree/hal/executable.h" |
| #include "iree/hal/queue.h" |
| #include "iree/hal/resource.h" |
| |
| #ifdef __cplusplus |
| extern "C" { |
| #endif // __cplusplus |
| |
| typedef struct iree_hal_device_t iree_hal_device_t; |
| |
| //===----------------------------------------------------------------------===// |
| // Types and Enums |
| //===----------------------------------------------------------------------===// |
| |
| // A bitfield specifying the mode of operation for a command buffer. |
| enum iree_hal_command_buffer_mode_bits_t { |
| IREE_HAL_COMMAND_BUFFER_MODE_DEFAULT = 0u, |
| |
| // Command buffer will be submitted once and never used again. |
| // This may enable in-place patching of command buffers that reduce overhead |
| // when it's known that command buffers will not be reused. |
| // If this bit is not set the command buffer may be submitted multiple times. |
| IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT = 1u << 0, |
| |
| // Indicates that command buffer execution is allowed to execute inline with
| // recording. The exact execution behavior is unspecified by the API and
| // intentionally unknowable: callers must always assume execution happens
| // entirely asynchronously and will only have completed after waiting on
| // device idle or on the semaphores signaled by the submission.
| // |
| // Local backends can use this to avoid recording when the calling program can |
| // guarantee that it makes no assumptions about execution being deferred until |
| // a submission. The command buffer must still be submitted for scheduling and |
| // must have no wait semaphores specified. This allows the same program code
| // to execute work both synchronously and asynchronously, as remote backends
| // are free to ignore this flag.
| // |
| // Remote backends can use this to flush the command buffer more aggressively |
| // to begin early execution and overlap with continued recording. |
| // |
| // Requires IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT. |
| IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION = 1u << 4, |
| |
| // Disables additional command buffer validation (if present). |
| // By default all command buffers are validated when compiled with
| // `IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE=1`; when shimming command buffers
| // or performing replay this validation can be disabled per command buffer.
| IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED = 1u << 5, |
| }; |
| typedef uint32_t iree_hal_command_buffer_mode_t; |
| |
| // A bitfield specifying the category of commands in a command queue. |
| enum iree_hal_command_category_bits_t { |
| // Command is considered a transfer operation (memcpy, etc). |
| IREE_HAL_COMMAND_CATEGORY_TRANSFER = 1u << 0, |
| // Command is considered a dispatch operation (dispatch/execute). |
| IREE_HAL_COMMAND_CATEGORY_DISPATCH = 1u << 1, |
| // Commands may be of any type. |
| // Using this value may prevent optimizations and if possible callers should |
| // always specify the strictest set possible (for example, only transfer |
| // commands to ensure they get placed on a DMA queue). |
| IREE_HAL_COMMAND_CATEGORY_ANY = |
| IREE_HAL_COMMAND_CATEGORY_TRANSFER | IREE_HAL_COMMAND_CATEGORY_DISPATCH, |
| }; |
| typedef uint32_t iree_hal_command_category_t; |
| |
| // Specifies a direct or indirect buffer binding. |
| // The range specified by [offset, length) of either the specified buffer or |
| // a buffer slot in the binding table will be used at the time the command is |
| // executed. |
| // |
| // The IREE HAL buffer type may internally be offset; such an offset is applied
| // here as if it were the base address of the buffer. Note that the offset is
| // applied at the time the binding is recorded into the command buffer.
| // |
| // Roughly maps to VkDescriptorSetBinding. |
| typedef struct iree_hal_buffer_ref_t { |
| // Currently unused and should be 0. |
| uint32_t reserved : 8; |
| // Binding table slot the buffer will be sourced from if buffer is NULL. |
| // Only valid on command buffers that support indirect execution. |
| uint32_t buffer_slot : 24; |
| // Buffer bound to the binding number. |
| // If NULL then the buffer_slot will be used to resolve the buffer at command |
| // buffer execution time from the binding table. |
| iree_hal_buffer_t* buffer; |
| // Offset, in bytes, into the buffer that the binding starts at. |
| // When indirectly referencing a binding table buffer this will be added to |
| // the base offset of the bound buffer. |
| iree_device_size_t offset; |
| // Length, in bytes, of the buffer after the offset that is accessed. |
| // This can be IREE_WHOLE_BUFFER; note, however, that if the entire buffer
| // contents are larger than supported by the device (usually ~128MiB) this
| // will fail.
| iree_device_size_t length; |
| } iree_hal_buffer_ref_t; |
| |
| // Returns a buffer reference that directly references |buffer|.
| static inline iree_hal_buffer_ref_t iree_hal_make_buffer_ref(
| iree_hal_buffer_t* buffer, iree_device_size_t offset,
| iree_device_size_t length) {
| iree_hal_buffer_ref_t buffer_ref = {0};
| buffer_ref.buffer = buffer;
| buffer_ref.offset = offset;
| buffer_ref.length = length;
| return buffer_ref;
| }
|
| // Returns a buffer reference resolved from |buffer_slot| of the binding table
| // at command buffer submission time.
| static inline iree_hal_buffer_ref_t iree_hal_make_indirect_buffer_ref(
| uint32_t buffer_slot, iree_device_size_t offset,
| iree_device_size_t length) {
| iree_hal_buffer_ref_t buffer_ref = {0};
| buffer_ref.buffer_slot = buffer_slot;
| buffer_ref.offset = offset;
| buffer_ref.length = length;
| return buffer_ref;
| }
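|
| // Example (illustrative sketch): constructing a direct reference to the first
| // 1KB of a buffer and an indirect reference resolved from binding table slot 2
| // at submission time. |my_buffer| is a placeholder for an already-allocated
| // iree_hal_buffer_t; IREE_WHOLE_BUFFER comes from iree/hal/buffer.h.
| //
| //  iree_hal_buffer_ref_t direct_ref =
| //      iree_hal_make_buffer_ref(my_buffer, /*offset=*/0, /*length=*/1024);
| //  iree_hal_buffer_ref_t indirect_ref =
| //      iree_hal_make_indirect_buffer_ref(/*buffer_slot=*/2, /*offset=*/0,
| //                                        /*length=*/IREE_WHOLE_BUFFER);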
| |
| // A list of buffer references. |
| typedef struct iree_hal_buffer_ref_list_t { |
| iree_host_size_t count; |
| const iree_hal_buffer_ref_t* values; |
| } iree_hal_buffer_ref_list_t; |
| |
| // Bitfield specifying which execution stage a barrier should start/end at. |
| // |
| // Maps to VkPipelineStageFlagBits. |
| enum iree_hal_execution_stage_bits_t { |
| // Top of the pipeline when commands are initially issued by the device. |
| IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE = 1u << 0, |
| // Stage of the pipeline when dispatch parameter data is consumed. |
| IREE_HAL_EXECUTION_STAGE_COMMAND_PROCESS = 1u << 1, |
| // Stage where dispatch commands execute. |
| IREE_HAL_EXECUTION_STAGE_DISPATCH = 1u << 2, |
| // Stage where transfer (copy/clear/fill/etc) commands execute. |
| IREE_HAL_EXECUTION_STAGE_TRANSFER = 1u << 3, |
| // Final stage in the pipeline when commands are retired on the device. |
| IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE = 1u << 4, |
| // Pseudo-stage for read/writes by the host. Not executed on device. |
| IREE_HAL_EXECUTION_STAGE_HOST = 1u << 5, |
| }; |
| typedef uint32_t iree_hal_execution_stage_t; |
| |
| // Bitfield specifying flags controlling an execution dependency. |
| // |
| // Maps to VkDependencyFlags. |
| enum iree_hal_execution_barrier_flag_bits_t { |
| IREE_HAL_EXECUTION_BARRIER_FLAG_NONE = 0, |
| }; |
| typedef uint32_t iree_hal_execution_barrier_flags_t; |
| |
| // Bitfield specifying which scopes will access memory and how. |
| // |
| // Maps to VkAccessFlagBits. |
| enum iree_hal_access_scope_bits_t { |
| // Read access to indirect command data as part of an indirect dispatch. |
| IREE_HAL_ACCESS_SCOPE_INDIRECT_COMMAND_READ = 1u << 0, |
| // Constant uniform buffer reads by the device. |
| IREE_HAL_ACCESS_SCOPE_CONSTANT_READ = 1u << 1, |
| // Storage buffer reads by dispatch commands. |
| IREE_HAL_ACCESS_SCOPE_DISPATCH_READ = 1u << 2, |
| // Storage buffer writes by dispatch commands. |
| IREE_HAL_ACCESS_SCOPE_DISPATCH_WRITE = 1u << 3, |
| // Source of a transfer operation. |
| IREE_HAL_ACCESS_SCOPE_TRANSFER_READ = 1u << 4, |
| // Target of a transfer operation. |
| IREE_HAL_ACCESS_SCOPE_TRANSFER_WRITE = 1u << 5, |
| // Read operation by the host through mapped memory. |
| IREE_HAL_ACCESS_SCOPE_HOST_READ = 1u << 6, |
| // Write operation by the host through mapped memory. |
| IREE_HAL_ACCESS_SCOPE_HOST_WRITE = 1u << 7, |
| // External/non-specific read. |
| IREE_HAL_ACCESS_SCOPE_MEMORY_READ = 1u << 8, |
| // External/non-specific write. |
| IREE_HAL_ACCESS_SCOPE_MEMORY_WRITE = 1u << 9, |
| }; |
| typedef uint32_t iree_hal_access_scope_t; |
| |
| // Defines a global memory barrier. |
| // These are cheaper to encode than buffer-specific barriers but may cause |
| // stalls and bubbles in device pipelines if applied too broadly. Prefer them |
| // over equivalently large sets of buffer-specific barriers (such as when |
| // completely changing execution contexts). |
| // |
| // Maps to VkMemoryBarrier. |
| typedef struct iree_hal_memory_barrier_t { |
| // All access scopes prior-to the barrier (inclusive). |
| iree_hal_access_scope_t source_scope; |
| // All access scopes following the barrier (inclusive). |
| iree_hal_access_scope_t target_scope; |
| } iree_hal_memory_barrier_t; |
| |
| // Defines a memory barrier that applies to a range of a specific buffer. |
| // Use of these (vs. global memory barriers) provides fine-grained execution |
| // ordering to device command processors and allows for more aggressive |
| // reordering. |
| // |
| // Maps to VkBufferMemoryBarrier. |
| typedef struct iree_hal_buffer_barrier_t { |
| // All access scopes prior-to the barrier (inclusive). |
| iree_hal_access_scope_t source_scope; |
| // All access scopes following the barrier (inclusive). |
| iree_hal_access_scope_t target_scope; |
| // Buffer the barrier is restricted to. |
| // The barrier will apply to the entire physical device allocation. |
| iree_hal_buffer_ref_t buffer_ref; |
| } iree_hal_buffer_barrier_t; |
| |
| // Bitfield indicating advice for implementations managing a buffer. |
| typedef uint64_t iree_hal_memory_advise_flags_t; |
| enum iree_hal_memory_advise_flag_bits_t { |
| IREE_HAL_MEMORY_ADVISE_FLAG_NONE = 0, |
| // TODO(benvanik): cache control operations (invalidate/flush). arg0/arg1 |
| // could source/target queue affinities. |
| // TODO(benvanik): prefetch and access type hints. |
| // TODO(benvanik): ASAN hints (protect/unprotect). |
| }; |
| |
| // Bitfield specifying flags controlling a fill operation. |
| typedef uint64_t iree_hal_fill_flags_t; |
| enum iree_hal_fill_flag_bits_t { |
| IREE_HAL_FILL_FLAG_NONE = 0, |
| }; |
| |
| // Bitfield specifying flags controlling an update operation. |
| typedef uint64_t iree_hal_update_flags_t; |
| enum iree_hal_update_flag_bits_t { |
| IREE_HAL_UPDATE_FLAG_NONE = 0, |
| }; |
| |
| // Bitfield specifying flags controlling a copy operation. |
| typedef uint64_t iree_hal_copy_flags_t; |
| enum iree_hal_copy_flag_bits_t { |
| IREE_HAL_COPY_FLAG_NONE = 0, |
| }; |
| |
| // Specifies the type of collective operation. |
| enum iree_hal_collective_kind_e { |
| // Gathers N*|element_count| elements of the specified type in |recv_binding| |
| // by sourcing |element_count| elements from the |send_binding| of each rank |
| // and concatenating them. |
| // |
| // |param|: unused |
| // |send_binding|: local elements placed at offset rank * |element_count|
| // |recv_binding|: concatenated results from all ranks |
| // In-place: |send_binding| == |recv_binding| + rank * |element_count| |
| // Equivalent to: |
| // ncclAllGather |
| IREE_HAL_COLLECTIVE_KIND_ALL_GATHER = 0u, |
| |
| // Reduces |element_count| elements of the specified type in |send_binding| |
| // using the specified reduction operation and places identical copies of the |
| // result in each |recv_binding|. |
| // |
| // |param|: unused |
| // |send_binding|: local elements to reduce |
| // |recv_binding|: copy of the reduction results |
| // In-place: |send_binding| == |recv_binding| |
| // Equivalent to: |
| // ncclAllReduce |
| IREE_HAL_COLLECTIVE_KIND_ALL_REDUCE, |
| |
| // Gathers |element_count| elements of the specified type in |recv_binding| by |
| // sourcing N parts of |element_count|/N elements, one from the |send_binding| |
| // of each rank, and concatenating them. |
| // |
| // |param|: unused |
| // |send_binding|: local elements to split and send to all ranks |
| // |recv_binding|: concatenated results from all ranks |
| IREE_HAL_COLLECTIVE_KIND_ALL_TO_ALL, |
| |
| // Copies |element_count| elements of the specified type from |send_binding| |
| // on the specified rank |param| to all other ranks |recv_binding|s. |
| // |
| // |param|: source rank of the broadcast value |
| // |send_binding|: only used on the source rank |
| // |recv_binding|: only used on non-source ranks |
| // In-place: |send_binding| == |recv_binding| |
| // Equivalent to: |
| // ncclBroadcast |
| IREE_HAL_COLLECTIVE_KIND_BROADCAST, |
| |
| // Reduces |element_count| elements of the specified type in |send_binding| |
| // using the specified reduction operation and places the results in the |
| // |recv_binding| of the target rank |param|. |
| // |
| // |param|: target rank of the resulting value |
| // |send_binding|: used on all ranks |
| // |recv_binding|: only used on the target rank |
| // In-place: |send_binding| == |recv_binding| |
| // Equivalent to: |
| // ncclReduce |
| IREE_HAL_COLLECTIVE_KIND_REDUCE, |
| |
| // Reduces |element_count| elements of the specified type in |send_binding|
| // from all ranks using the specified reduction operation and scatters the |
| // reduced results over the ranks such that the |recv_binding| on rank i |
| // will contain the i-th block of the results. |
| // |
| // |param|: unused |
| // |send_binding|: used on all ranks |
| // |recv_binding|: partial results for the hosting rank |
| // In-place: |recv_binding| == |send_binding| + rank * |element_count| |
| // Equivalent to: |
| // ncclReduceScatter |
| IREE_HAL_COLLECTIVE_KIND_REDUCE_SCATTER, |
| |
| // Sends |element_count| elements of the specified type in |send_binding| to |
| // the target rank |param|. |
| // |
| // |param|: target performing a IREE_HAL_COLLECTIVE_KIND_RECV |
| // |send_binding|: used on source |
| // |recv_binding|: unused |
| // Equivalent to: |
| // ncclSend |
| IREE_HAL_COLLECTIVE_KIND_SEND, |
| |
| // Receives |element_count| elements of the specified type in |recv_binding| |
| // from source rank |param|. |
| // |
| // |param|: source performing a IREE_HAL_COLLECTIVE_KIND_SEND |
| // |send_binding|: unused |
| // |recv_binding|: used on target |
| // Equivalent to: |
| // ncclRecv |
| IREE_HAL_COLLECTIVE_KIND_RECV, |
| |
| // |param| stores the target rank in the low 16 bits and the source rank in
| // the high 16 bits. Sends |element_count| elements of the specified type in
| // |send_binding| to the target rank unless it is -1. Receives |element_count|
| // elements of the specified type in |recv_binding| from the source rank; if
| // the source rank is -1 the result will be all zeros.
| //
| // |param|: low 16 bits are the target rank, high 16 bits are the source rank
| // |send_binding|: used on the source
| // |recv_binding|: used on the target
| IREE_HAL_COLLECTIVE_KIND_SEND_RECV, |
| |
| // Maximum enumeration value for collective operations. |
| IREE_HAL_COLLECTIVE_KIND_MAX_VALUE = IREE_HAL_COLLECTIVE_KIND_SEND_RECV, |
| }; |
| typedef uint8_t iree_hal_collective_kind_t; |
| |
| // Specifies the reduction operator of a collective reduction operation. |
| enum iree_hal_collective_reduction_e { |
| // Specifies that the reduction operation is unspecified. |
| IREE_HAL_COLLECTIVE_REDUCTION_NONE = 0, |
| // Specifies that the reduction operation computes a sum (addition). |
| IREE_HAL_COLLECTIVE_REDUCTION_SUM = 1, |
| // Specifies that the reduction operation computes a product (multiplication). |
| IREE_HAL_COLLECTIVE_REDUCTION_PRODUCT, |
| // Specifies that the reduction operation computes a minimum (min). |
| IREE_HAL_COLLECTIVE_REDUCTION_MINIMUM, |
| // Specifies that the reduction operation computes a maximum (max). |
| IREE_HAL_COLLECTIVE_REDUCTION_MAXIMUM, |
| // Specifies that the reduction operation computes an average (avg). |
| IREE_HAL_COLLECTIVE_REDUCTION_AVERAGE, |
| // Maximum enumeration value for reduction types. |
| IREE_HAL_COLLECTIVE_REDUCTION_MAX_VALUE = |
| IREE_HAL_COLLECTIVE_REDUCTION_AVERAGE, |
| }; |
| typedef uint8_t iree_hal_collective_reduction_t; |
| |
| // Specifies the element type as processed by a collective operation. |
| // Note that these types are a much more restricted set compared to
| // iree_hal_element_type_t as most collective compute libraries only expose a
| // limited number of primitives, some of which may be backed by fixed-function
| // hardware.
| enum iree_hal_collective_element_type_e { |
| IREE_HAL_COLLECTIVE_ELEMENT_TYPE_SINT_8 = 0, |
| IREE_HAL_COLLECTIVE_ELEMENT_TYPE_UINT_8, |
| IREE_HAL_COLLECTIVE_ELEMENT_TYPE_SINT_16, // not commonly implemented |
| IREE_HAL_COLLECTIVE_ELEMENT_TYPE_UINT_16, // not commonly implemented |
| IREE_HAL_COLLECTIVE_ELEMENT_TYPE_SINT_32, |
| IREE_HAL_COLLECTIVE_ELEMENT_TYPE_UINT_32, |
| IREE_HAL_COLLECTIVE_ELEMENT_TYPE_SINT_64, |
| IREE_HAL_COLLECTIVE_ELEMENT_TYPE_UINT_64, |
| IREE_HAL_COLLECTIVE_ELEMENT_TYPE_FLOAT_16, |
| IREE_HAL_COLLECTIVE_ELEMENT_TYPE_FLOAT_32, |
| IREE_HAL_COLLECTIVE_ELEMENT_TYPE_FLOAT_64, |
| IREE_HAL_COLLECTIVE_ELEMENT_TYPE_BFLOAT_16, |
| IREE_HAL_COLLECTIVE_ELEMENT_TYPE_MAX_VALUE = |
| IREE_HAL_COLLECTIVE_ELEMENT_TYPE_BFLOAT_16, |
| }; |
| typedef uint8_t iree_hal_collective_element_type_t; |
| |
| // Describes a collective operation. |
| typedef union { |
| uint32_t packed; // packed value |
| struct { |
| // Collective operation. |
| iree_hal_collective_kind_t kind; |
| // Reduction type (for reduction ops). |
| iree_hal_collective_reduction_t reduction; |
| // Element type. |
| iree_hal_collective_element_type_t element_type; |
| // Reserved for future use. |
| uint8_t reserved; |
| }; |
| } iree_hal_collective_op_t; |
| static_assert(sizeof(iree_hal_collective_op_t) == sizeof(uint32_t), |
| "must pack"); |
| |
| // Writes a string description of |op| to the |out_temp| storage and returns |
| // a string view into the storage of the resulting value. |
| IREE_API_EXPORT iree_string_view_t iree_hal_collective_op_format( |
| const iree_hal_collective_op_t* op, iree_bitfield_string_temp_t* out_temp); |
| |
| // Returns the number of bytes each |element_type| consumes in memory. |
| IREE_API_EXPORT iree_device_size_t iree_hal_collective_element_byte_count( |
| iree_hal_collective_element_type_t element_type); |
| |
| // Bitfield specifying flags controlling a dispatch operation. |
| typedef uint64_t iree_hal_dispatch_flags_t; |
| enum iree_hal_dispatch_flag_bits_t { |
| IREE_HAL_DISPATCH_FLAG_NONE = 0, |
| }; |
| |
| // An RGBA color. |
| typedef struct iree_hal_label_color_t { |
| uint8_t r; |
| uint8_t g; |
| uint8_t b; |
| uint8_t a; |
| } iree_hal_label_color_t; |
| |
| // A source location attached to debug labels. |
| typedef struct iree_hal_label_location_t { |
| iree_string_view_t file; |
| int line; |
| } iree_hal_label_location_t; |
| |
| // An unspecified color; debugging tools are to choose their own. |
| static inline iree_hal_label_color_t iree_hal_label_color_unspecified(void) {
| iree_hal_label_color_t color = {0, 0, 0, 0}; |
| return color; |
| } |
| |
| // Formats a command buffer mode bitfield as a string. |
| // See iree_bitfield_format for usage. |
| IREE_API_EXPORT iree_string_view_t |
| iree_hal_command_buffer_mode_format(iree_hal_command_buffer_mode_t value, |
| iree_bitfield_string_temp_t* out_temp); |
| |
| // Formats a command category bitfield as a string. |
| // See iree_bitfield_format for usage. |
| IREE_API_EXPORT iree_string_view_t iree_hal_command_category_format( |
| iree_hal_command_category_t value, iree_bitfield_string_temp_t* out_temp); |
| |
| // Maximum size of any update in iree_hal_command_buffer_update_buffer. |
| // 64KB is the limit on Vulkan and we uniformly use that today across all
| // targets so as not to require too much command buffer memory.
| #define IREE_HAL_COMMAND_BUFFER_MAX_UPDATE_SIZE \ |
| ((iree_device_size_t)(64 * 1024)) |
| |
| //===----------------------------------------------------------------------===// |
| // iree_hal_buffer_binding_table_t |
| //===----------------------------------------------------------------------===// |
| |
| // Describes a subrange of a buffer that can be bound to a binding slot. |
| typedef struct iree_hal_buffer_binding_t { |
| // Buffer being bound to the slot, if any. |
| iree_hal_buffer_t* buffer; |
| // Offset, in bytes, into the buffer that the binding starts at. |
| // This will be added to the offset specified on each usage of the slot. |
| iree_device_size_t offset; |
| // Length, in bytes, of the buffer that is available to the executable. |
| // This can be IREE_WHOLE_BUFFER; note, however, that if the entire buffer
| // contents are larger than supported by the device (usually ~128MiB) this
| // will fail. If the descriptor type is dynamic this will be used for all |
| // ranges regardless of offset. |
| iree_device_size_t length; |
| } iree_hal_buffer_binding_t; |
| |
| typedef struct iree_hal_buffer_binding_table_t { |
| iree_host_size_t count; |
| const iree_hal_buffer_binding_t* bindings; |
| } iree_hal_buffer_binding_table_t; |
| |
| static inline iree_hal_buffer_binding_table_t |
| iree_hal_buffer_binding_table_empty(void) { |
| iree_hal_buffer_binding_table_t table = {0, NULL}; |
| return table; |
| } |
| |
| static inline bool iree_hal_buffer_binding_table_is_empty( |
| iree_hal_buffer_binding_table_t binding_table) { |
| return binding_table.count == 0; |
| } |
| |
| // Resolves |buffer_ref| to an unretained buffer: either the directly
| // referenced buffer or, when indirect, the buffer bound to the specified slot
| // of |binding_table|. If the caller needs to preserve the buffer for longer
| // than the (known) lifetime of the binding table then it must be retained or
| // added to a resource set.
| static inline iree_status_t iree_hal_buffer_binding_table_resolve_ref( |
| iree_hal_buffer_binding_table_t binding_table, |
| iree_hal_buffer_ref_t buffer_ref, iree_hal_buffer_ref_t* out_resolved_ref) { |
| if (buffer_ref.buffer) { |
| // Direct buffer reference. |
| *out_resolved_ref = buffer_ref; |
| return iree_ok_status(); |
| } else if (binding_table.count == 0) { |
| // NULL buffer reference. |
| memset(out_resolved_ref, 0, sizeof(*out_resolved_ref)); |
| return iree_ok_status(); |
| } else if (IREE_UNLIKELY(buffer_ref.buffer_slot >= binding_table.count)) { |
| // Out of bounds slot (validation should have caught). May be worth removing |
| // this case as this is a hot path. |
| // NOTE: this asserts that all incoming buffers must not be NULL. That may |
| // not be true. |
| return iree_make_status(IREE_STATUS_OUT_OF_RANGE, |
| "buffer binding %u out of range of binding table " |
| "with capacity %" PRIhsz, |
| buffer_ref.buffer_slot, binding_table.count); |
| } else { |
| // Indirect buffer reference - need to combine the final range based on |
| // the binding table range and the range of the reference. |
| const iree_hal_buffer_binding_t* binding = |
| &binding_table.bindings[buffer_ref.buffer_slot]; |
| out_resolved_ref->reserved = buffer_ref.reserved; |
| out_resolved_ref->buffer_slot = 0; |
| out_resolved_ref->buffer = binding->buffer; |
| return iree_hal_buffer_calculate_range( |
| binding->offset, binding->length, buffer_ref.offset, buffer_ref.length, |
| &out_resolved_ref->offset, &out_resolved_ref->length); |
| } |
| } |
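|
| // Example (illustrative sketch): resolving an indirect reference against a
| // caller-provided binding table. |buffer_a| and |buffer_b| are placeholders
| // for buffers supplied at submission time; IREE_ARRAYSIZE and
| // IREE_RETURN_IF_ERROR come from iree/base/api.h.
| //
| //  const iree_hal_buffer_binding_t bindings[2] = {
| //      {buffer_a, 0, IREE_WHOLE_BUFFER},
| //      {buffer_b, 256, 1024},
| //  };
| //  iree_hal_buffer_binding_table_t binding_table = {
| //      .count = IREE_ARRAYSIZE(bindings),
| //      .bindings = bindings,
| //  };
| //  iree_hal_buffer_ref_t resolved_ref;
| //  IREE_RETURN_IF_ERROR(iree_hal_buffer_binding_table_resolve_ref(
| //      binding_table,
| //      iree_hal_make_indirect_buffer_ref(/*buffer_slot=*/1, /*offset=*/0,
| //                                        /*length=*/512),
| //      &resolved_ref));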
| |
| //===----------------------------------------------------------------------===// |
| // iree_hal_command_buffer_t |
| //===----------------------------------------------------------------------===// |
| |
| // Asynchronous command buffer recording interface. |
| // Commands are recorded by the implementation for later submission to device |
| // queues. |
| // |
| // Buffers, events, and programs referenced must remain valid and not be |
| // modified or read while there are commands in-flight. The usual flow is to |
| // populate input buffers, dispatch using those buffers, wait on a semaphore |
| // until the buffers are guaranteed to no longer be in use, and then reuse the |
| // buffers. Lifetimes are managed by the command buffer and all used resources |
| // will be retained for as long as the command buffer is live or until it is |
| // reset. |
| // |
| // Buffers referenced by a command buffer may be either direct (a concrete |
| // iree_hal_buffer_t reference) or indirect (a binding table slot ordinal). |
| // Direct buffer references are embedded in the command buffer and cannot be |
| // changed and the referenced resources will be kept live for as long as the |
| // command buffer is live. Indirect references are placeholders indicating that |
| // at the time the command buffer is submitted to a device queue a buffer will |
| // be provided allowing for the same command buffer to be reused with different |
| // buffers. Indirect command buffers are not concurrently schedulable unless
| // specified, as many implementations need per-submission shadow resources.
| // Validation of direct buffer references happens as the commands are recorded |
| // and further validation is not required. Indirect buffer references are |
| // validated upon submission with a populated binding table. |
| // |
| // Errors that can be recognized when operations are enqueued will be returned |
| // immediately, such as invalid argument errors. Errors that can only be |
| // determined at execution time will be returned on semaphores. Once a failure |
| // occurs the device queue will enter an error state that invalidates all |
| // operations on the device queue (as ordering is not strict and any may still |
| // be in-flight). In this case the user of the device queue should treat all |
| // in-flight operations as cancelled and fully reset themselves. Other device |
| // queues that may be waiting on events from the device queue will also enter |
| // error states. Only once a user has acknowledged and cleared the error state
| // with a Reset will the queue become usable again; until then all operations
| // will return errors.
| // |
| // Command buffers are thread-compatible. Use multiple command buffers if
| // trying to record commands from multiple threads. Command buffers must not be
| // mutated between when they are submitted for execution on a queue and when
| // the semaphore fires indicating the completion of their execution.
| typedef struct iree_hal_command_buffer_t iree_hal_command_buffer_t; |
| |
| // Creates a command buffer ready to begin recording, possibly reusing an |
| // existing one from the |device| pool. |
| // |
| // |binding_capacity| specifies the maximum number of indirect binding slots
| // available for use by commands referencing the binding table (via the
| // buffer_slot field of iree_hal_buffer_ref_t). Must only be non-zero for
| // command buffer modes supporting indirect bindings.
| // |
| // |queue_affinity| specifies the device queues the command buffer may be |
| // submitted to. The queue affinity provided to iree_hal_device_queue_execute |
| // must match or be a subset of the |queue_affinity|. |
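| //
| // Example (illustrative sketch): creating a one-shot command buffer that may
| // record both transfer and dispatch commands on any queue. |device| is a
| // placeholder for an opened device; IREE_HAL_QUEUE_AFFINITY_ANY comes from
| // iree/hal/queue.h and IREE_RETURN_IF_ERROR from iree/base/api.h.
| //
| //  iree_hal_command_buffer_t* command_buffer = NULL;
| //  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_create(
| //      device, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
| //      IREE_HAL_COMMAND_CATEGORY_ANY, IREE_HAL_QUEUE_AFFINITY_ANY,
| //      /*binding_capacity=*/0, &command_buffer));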
| IREE_API_EXPORT iree_status_t iree_hal_command_buffer_create( |
| iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode, |
| iree_hal_command_category_t command_categories, |
| iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, |
| iree_hal_command_buffer_t** out_command_buffer); |
| |
| // Retains the given |command_buffer| for the caller. |
| IREE_API_EXPORT void iree_hal_command_buffer_retain( |
| iree_hal_command_buffer_t* command_buffer); |
| |
| // Releases the given |command_buffer| from the caller. |
| IREE_API_EXPORT void iree_hal_command_buffer_release( |
| iree_hal_command_buffer_t* command_buffer); |
| |
| // Returns a bitmask indicating the behavior of the command buffer. |
| IREE_API_EXPORT iree_hal_command_buffer_mode_t |
| iree_hal_command_buffer_mode(const iree_hal_command_buffer_t* command_buffer); |
| |
| // Returns a bitmask indicating which command categories this command buffer |
| // can record. |
| IREE_API_EXPORT iree_hal_command_category_t |
| iree_hal_command_buffer_allowed_categories( |
| const iree_hal_command_buffer_t* command_buffer); |
| |
| // Begins recording into the command buffer. |
| // The command buffer must not have been recorded already; this is only valid to |
| // call once after creation and must be paired with iree_hal_command_buffer_end. |
| IREE_API_EXPORT iree_status_t |
| iree_hal_command_buffer_begin(iree_hal_command_buffer_t* command_buffer); |
| |
| // Ends recording into the command buffer. |
| // This must be called prior to submitting the command buffer for execution. |
| IREE_API_EXPORT iree_status_t |
| iree_hal_command_buffer_end(iree_hal_command_buffer_t* command_buffer); |
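|
| // Example (illustrative sketch): the minimal recording lifecycle. Submission
| // itself happens through the device queue APIs declared in device.h (such as
| // iree_hal_device_queue_execute); the calls shown here are those declared in
| // this header.
| //
| //  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_begin(command_buffer));
| //  // ... record commands ...
| //  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_end(command_buffer));
| //  // Submit via the owning device queue, then release the local reference:
| //  iree_hal_command_buffer_release(command_buffer);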
| |
| // Pushes a new debug group with the given |label|. |
| // All commands between this and a mandatory matching call to |
| // iree_hal_command_buffer_end_debug_group will be grouped together with the |
| // given label. If a source location is available it can be provided via |
| // |location| to allow mapping back into the source program that issued the |
| // commands. |
| // |
| // An optional RGBA color to show in the debug UI may be provided via |
| // |label_color|; otherwise iree_hal_label_color_unspecified can be used to let |
| // the debug tool choose. |
| IREE_API_EXPORT void iree_hal_command_buffer_begin_debug_group( |
| iree_hal_command_buffer_t* command_buffer, iree_string_view_t label, |
| iree_hal_label_color_t label_color, |
| const iree_hal_label_location_t* location); |
| |
| // Pops a debug group from the stack. |
| IREE_API_EXPORT void iree_hal_command_buffer_end_debug_group( |
| iree_hal_command_buffer_t* command_buffer); |
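|
| // Example (illustrative sketch): grouping a sequence of commands under a label
| // for debug/profiling tools. iree_make_cstring_view comes from
| // iree/base/api.h; the label text is arbitrary.
| //
| //  iree_hal_command_buffer_begin_debug_group(
| //      command_buffer, iree_make_cstring_view("my_phase"),
| //      iree_hal_label_color_unspecified(), /*location=*/NULL);
| //  // ... record grouped commands ...
| //  iree_hal_command_buffer_end_debug_group(command_buffer);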
| |
| // Defines a memory dependency between commands recorded before and after the |
| // barrier. One or more memory or buffer barriers can be specified to indicate |
| // between which stages or buffers the dependencies exist. |
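| //
| // Example (illustrative sketch): ordering a dispatch that writes a buffer
| // before a transfer that later reads it, using one global memory barrier.
| //
| //  const iree_hal_memory_barrier_t barrier = {
| //      .source_scope = IREE_HAL_ACCESS_SCOPE_DISPATCH_WRITE,
| //      .target_scope = IREE_HAL_ACCESS_SCOPE_TRANSFER_READ,
| //  };
| //  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_execution_barrier(
| //      command_buffer, IREE_HAL_EXECUTION_STAGE_DISPATCH,
| //      IREE_HAL_EXECUTION_STAGE_TRANSFER,
| //      IREE_HAL_EXECUTION_BARRIER_FLAG_NONE,
| //      /*memory_barrier_count=*/1, &barrier,
| //      /*buffer_barrier_count=*/0, NULL));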
| IREE_API_EXPORT iree_status_t iree_hal_command_buffer_execution_barrier( |
| iree_hal_command_buffer_t* command_buffer, |
| iree_hal_execution_stage_t source_stage_mask, |
| iree_hal_execution_stage_t target_stage_mask, |
| iree_hal_execution_barrier_flags_t flags, |
| iree_host_size_t memory_barrier_count, |
| const iree_hal_memory_barrier_t* memory_barriers, |
| iree_host_size_t buffer_barrier_count, |
| const iree_hal_buffer_barrier_t* buffer_barriers); |
| |
| // Sets an event to the signaled state. |
| // |source_stage_mask| specifies when the event is signaled. |
| // |
| // Events are only valid within a single command buffer. Events can only be |
| // used on non-transfer queues. |
| IREE_API_EXPORT iree_status_t iree_hal_command_buffer_signal_event( |
| iree_hal_command_buffer_t* command_buffer, iree_hal_event_t* event, |
| iree_hal_execution_stage_t source_stage_mask); |
| |
| // Resets an event to the non-signaled state. |
| // |source_stage_mask| specifies when the event is unsignaled. |
| // |
| // Events are only valid within a single command buffer. Events can only be |
| // used on non-transfer queues. |
| IREE_API_EXPORT iree_status_t iree_hal_command_buffer_reset_event( |
| iree_hal_command_buffer_t* command_buffer, iree_hal_event_t* event, |
| iree_hal_execution_stage_t source_stage_mask); |
| |
| // Waits for one or more events to be signaled and defines a memory dependency |
| // between the synchronization scope of the signal operations and the commands |
| // following the wait. |
| // |
| // |source_stage_mask| must include IREE_HAL_EXECUTION_STAGE_HOST for
| // host-side signaling of the event to be visible.
| // |
| // Events are only valid within a single command buffer. Events remain |
| // signaled even after waiting and must be reset to be reused. Events can only |
| // be used on non-transfer queues. |
| IREE_API_EXPORT iree_status_t iree_hal_command_buffer_wait_events( |
| iree_hal_command_buffer_t* command_buffer, iree_host_size_t event_count, |
| const iree_hal_event_t** events, |
| iree_hal_execution_stage_t source_stage_mask, |
| iree_hal_execution_stage_t target_stage_mask, |
| iree_host_size_t memory_barrier_count, |
| const iree_hal_memory_barrier_t* memory_barriers, |
| iree_host_size_t buffer_barrier_count, |
| const iree_hal_buffer_barrier_t* buffer_barriers); |
| |
| // Advises the device about the usage of the given buffer. |
| // The device may use this information to perform cache management or ignore it |
| // entirely. |
| IREE_API_EXPORT iree_status_t iree_hal_command_buffer_advise_buffer( |
| iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_ref_t buffer_ref, |
| iree_hal_memory_advise_flags_t flags, uint64_t arg0, uint64_t arg1); |
| |
| // Fills the target buffer with the given repeating value. |
| // Expects that |pattern_length| is one of 1, 2, or 4 and that the offset and |
| // length are aligned to the natural alignment of the value. |
| // The target buffer must be compatible with the devices owned by this |
| // device queue and be allocated with IREE_HAL_BUFFER_USAGE_TRANSFER. |
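| //
| // Example (illustrative sketch): zero-filling the first 4KB of a buffer with a
| // 4-byte pattern. |target_buffer| is a placeholder for a transfer-capable
| // buffer.
| //
| //  const uint32_t zero_pattern = 0;
| //  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_fill_buffer(
| //      command_buffer,
| //      iree_hal_make_buffer_ref(target_buffer, /*offset=*/0, /*length=*/4096),
| //      &zero_pattern, sizeof(zero_pattern), IREE_HAL_FILL_FLAG_NONE));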
| IREE_API_EXPORT iree_status_t iree_hal_command_buffer_fill_buffer( |
| iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_ref_t target_ref, |
| const void* pattern, iree_host_size_t pattern_length, |
| iree_hal_fill_flags_t flags); |
| |
| // Updates a range of the given target buffer from the source host memory. |
| // The source host memory is copied immediately into the command buffer and |
| // occupies command buffer space. It is strongly recommended that large buffer |
| // updates are performed via iree_hal_command_buffer_copy_buffer where there is |
| // the possibility of a zero-copy path. |
| // The |source_buffer| may be released by the caller immediately after this |
| // call returns. |
| // The |target_buffer| must be compatible with the devices owned by this |
| // device queue and be allocated with IREE_HAL_BUFFER_USAGE_TRANSFER. |
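| //
| // Example (illustrative sketch): uploading a small block of host constants.
| // The update length (taken from the target reference) must not exceed
| // IREE_HAL_COMMAND_BUFFER_MAX_UPDATE_SIZE; |target_buffer| is a placeholder.
| //
| //  static const float kConstants[4] = {1.0f, 2.0f, 3.0f, 4.0f};
| //  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_update_buffer(
| //      command_buffer, kConstants, /*source_offset=*/0,
| //      iree_hal_make_buffer_ref(target_buffer, /*offset=*/0,
| //                               sizeof(kConstants)),
| //      IREE_HAL_UPDATE_FLAG_NONE));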
| IREE_API_EXPORT iree_status_t iree_hal_command_buffer_update_buffer( |
| iree_hal_command_buffer_t* command_buffer, const void* source_buffer, |
| iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref, |
| iree_hal_update_flags_t flags); |
| |
| // Copies a range of one buffer to another. |
| // Both buffers must be compatible with the devices owned by this device |
| // queue and be allocated with IREE_HAL_BUFFER_USAGE_TRANSFER. Though the source |
| // and target buffer may be the same the ranges must not overlap (as with |
| // memcpy). |
| // |
| // This can be used to perform device->host, host->device, and device->device |
| // copies. |
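| //
| // Example (illustrative sketch): copying 64KB from a staging buffer into a
| // device-local buffer. |staging_buffer| and |device_buffer| are placeholders.
| //
| //  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_copy_buffer(
| //      command_buffer,
| //      iree_hal_make_buffer_ref(staging_buffer, /*offset=*/0,
| //                               /*length=*/64 * 1024),
| //      iree_hal_make_buffer_ref(device_buffer, /*offset=*/0,
| //                               /*length=*/64 * 1024),
| //      IREE_HAL_COPY_FLAG_NONE));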
| IREE_API_EXPORT iree_status_t iree_hal_command_buffer_copy_buffer( |
| iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_ref_t source_ref, |
| iree_hal_buffer_ref_t target_ref, iree_hal_copy_flags_t flags); |
| |
| // Dispatches a collective operation defined by |op| using the given buffers. |
| // |param| must be specified for operations that require a root/peer rank |
| // identifier and is otherwise ignored. |
| IREE_API_EXPORT iree_status_t iree_hal_command_buffer_collective( |
| iree_hal_command_buffer_t* command_buffer, iree_hal_channel_t* channel, |
| iree_hal_collective_op_t op, uint32_t param, iree_hal_buffer_ref_t send_ref, |
| iree_hal_buffer_ref_t recv_ref, iree_device_size_t element_count); |
| |
| // Dispatches an execution request. |
| // The request may execute overlapped with any other transfer operation or |
| // dispatch made within the same barrier-defined sequence. The executable |
| // specified must be registered for use with the device driver owning this |
| // queue. |
| // |
| // The provided constant data and binding list will be recorded into the command |
| // buffer and need not remain live beyond the call. |
| // |
| // Fails if the queue does not support dispatch operations or |
| // IREE_HAL_COMMAND_CATEGORY_DISPATCH was not set. |
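| //
| // Example (illustrative sketch): dispatching entry point 0 of an executable
| // over a 64x1x1 workgroup grid with two direct bindings and no constants.
| // |executable|, |input_buffer|, and |output_buffer| are placeholders;
| // iree_make_const_byte_span comes from iree/base/api.h.
| //
| //  const iree_hal_buffer_ref_t binding_values[2] = {
| //      iree_hal_make_buffer_ref(input_buffer, 0, IREE_WHOLE_BUFFER),
| //      iree_hal_make_buffer_ref(output_buffer, 0, IREE_WHOLE_BUFFER),
| //  };
| //  const iree_hal_buffer_ref_list_t bindings = {
| //      .count = IREE_ARRAYSIZE(binding_values),
| //      .values = binding_values,
| //  };
| //  const uint32_t workgroup_count[3] = {64, 1, 1};
| //  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_dispatch(
| //      command_buffer, executable, /*entry_point=*/0, workgroup_count,
| //      iree_make_const_byte_span(NULL, 0), bindings,
| //      IREE_HAL_DISPATCH_FLAG_NONE));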
| IREE_API_EXPORT iree_status_t iree_hal_command_buffer_dispatch( |
| iree_hal_command_buffer_t* command_buffer, |
| iree_hal_executable_t* executable, int32_t entry_point, |
| const uint32_t workgroup_count[3], iree_const_byte_span_t constants, |
| iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags); |
| |
| // Dispatches an execution request with a deferred workgroup count. |
| // This is the same as iree_hal_command_buffer_dispatch but the workgroup count |
| // is read from the given |workgroups_ref| buffer at the specified offset as |
| // 3 uint32_t XYZ values immediately before performing the dispatch. This allows |
| // prior dispatches within the command sequence to populate the workgroup |
| // count or the workgroup count to change across submissions of the same |
| // reusable command buffer. |
| // |
| // The buffer must have been allocated with |
| // IREE_HAL_BUFFER_USAGE_DISPATCH_INDIRECT_PARAMS and be of |
| // IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE. |
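| //
| // Example (illustrative sketch): sourcing the workgroup count from three
| // uint32_t values at offset 0 of a parameters buffer that an earlier dispatch
| // populated. |workgroups_buffer| and |bindings| are placeholders following the
| // direct dispatch example above.
| //
| //  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_dispatch_indirect(
| //      command_buffer, executable, /*entry_point=*/0,
| //      iree_hal_make_buffer_ref(workgroups_buffer, /*offset=*/0,
| //                               /*length=*/3 * sizeof(uint32_t)),
| //      iree_make_const_byte_span(NULL, 0), bindings,
| //      IREE_HAL_DISPATCH_FLAG_NONE));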
| IREE_API_EXPORT iree_status_t iree_hal_command_buffer_dispatch_indirect( |
| iree_hal_command_buffer_t* command_buffer, |
| iree_hal_executable_t* executable, int32_t entry_point, |
| iree_hal_buffer_ref_t workgroups_ref, iree_const_byte_span_t constants, |
| iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags); |
| |
| //===----------------------------------------------------------------------===// |
| // Validation support |
| //===----------------------------------------------------------------------===// |
| |
| // Validates that all bindings in the provided |binding_table| match the |
| // requirements of |command_buffer| as recorded. If the command buffer does not |
| // use any indirect bindings the table will be ignored. If more bindings than |
| // are used by the command buffer are provided they will be ignored. |
| IREE_API_EXPORT iree_status_t iree_hal_command_buffer_validate_submission( |
| iree_hal_command_buffer_t* command_buffer, |
| iree_hal_buffer_binding_table_t binding_table); |
| |
| //===----------------------------------------------------------------------===// |
| // Utilities for command buffer creation |
| //===----------------------------------------------------------------------===// |
| |
| // Defines a transfer command operation. |
| typedef enum iree_hal_transfer_command_type_t { |
| // iree_hal_command_buffer_fill_buffer |
| IREE_HAL_TRANSFER_COMMAND_TYPE_FILL = 0u, |
| // iree_hal_command_buffer_update_buffer |
| IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE = 1u, |
| // iree_hal_command_buffer_copy_buffer |
| IREE_HAL_TRANSFER_COMMAND_TYPE_COPY = 2u, |
| } iree_hal_transfer_command_type_t; |
| |
| // Represents a single transfer command within a batch of commands. |
| typedef struct iree_hal_transfer_command_t { |
| // The type of the command, selecting which payload in the union below is used.
| iree_hal_transfer_command_type_t type; |
| union { |
| // IREE_HAL_TRANSFER_COMMAND_TYPE_FILL |
| struct { |
| iree_hal_buffer_t* target_buffer; |
| iree_device_size_t target_offset; |
| iree_device_size_t length; |
| const void* pattern; |
| iree_host_size_t pattern_length; |
| } fill; |
| // IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE |
| struct { |
| const void* source_buffer; |
| iree_host_size_t source_offset; |
| iree_hal_buffer_t* target_buffer; |
| iree_device_size_t target_offset; |
| iree_device_size_t length; |
| } update; |
| // IREE_HAL_TRANSFER_COMMAND_TYPE_COPY |
| struct { |
| iree_hal_buffer_t* source_buffer; |
| iree_device_size_t source_offset; |
| iree_hal_buffer_t* target_buffer; |
| iree_device_size_t target_offset; |
| iree_device_size_t length; |
| } copy; |
| }; |
| } iree_hal_transfer_command_t; |
| |
| // Builds a command buffer containing a recording of all |transfer_commands|. |
| // All buffers must be compatible with |device| and ranges must not overlap |
| // (same as with memcpy). All commands are executed concurrently with no |
| // barriers. The provided commands and any referenced data need only remain
| // live during recording, while all referenced buffers must be kept valid by
| // the caller until the command buffer has completed execution.
| // |
| // This is just a utility to make it easier to quickly construct batches of |
| // transfer operations. If more control is required then record the command |
| // buffer as normal. |
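| //
| // Example (illustrative sketch): batching a fill and a copy into one command
| // buffer. |device|, |target_buffer|, |staging_buffer|, and |device_buffer| are
| // placeholders.
| //
| //  const uint32_t zero_pattern = 0;
| //  const iree_hal_transfer_command_t transfer_commands[2] = {
| //      {
| //          .type = IREE_HAL_TRANSFER_COMMAND_TYPE_FILL,
| //          .fill = {target_buffer, 0, 4096, &zero_pattern,
| //                   sizeof(zero_pattern)},
| //      },
| //      {
| //          .type = IREE_HAL_TRANSFER_COMMAND_TYPE_COPY,
| //          .copy = {staging_buffer, 0, device_buffer, 0, 64 * 1024},
| //      },
| //  };
| //  iree_hal_command_buffer_t* command_buffer = NULL;
| //  IREE_RETURN_IF_ERROR(iree_hal_create_transfer_command_buffer(
| //      device, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
| //      IREE_HAL_QUEUE_AFFINITY_ANY, IREE_ARRAYSIZE(transfer_commands),
| //      transfer_commands, &command_buffer));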
| IREE_API_EXPORT iree_status_t iree_hal_create_transfer_command_buffer( |
| iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode, |
| iree_hal_queue_affinity_t queue_affinity, iree_host_size_t transfer_count, |
| const iree_hal_transfer_command_t* transfer_commands, |
| iree_hal_command_buffer_t** out_command_buffer); |
| |
| //===----------------------------------------------------------------------===// |
| // iree_hal_command_buffer_t implementation details |
| //===----------------------------------------------------------------------===// |
| |
| typedef struct iree_hal_command_buffer_vtable_t { |
| void(IREE_API_PTR* destroy)(iree_hal_command_buffer_t* command_buffer); |
| |
| iree_status_t(IREE_API_PTR* begin)(iree_hal_command_buffer_t* command_buffer); |
| iree_status_t(IREE_API_PTR* end)(iree_hal_command_buffer_t* command_buffer); |
| |
| void(IREE_API_PTR* begin_debug_group)( |
| iree_hal_command_buffer_t* command_buffer, iree_string_view_t label, |
| iree_hal_label_color_t label_color, |
| const iree_hal_label_location_t* location); |
| void(IREE_API_PTR* end_debug_group)( |
| iree_hal_command_buffer_t* command_buffer); |
| |
| iree_status_t(IREE_API_PTR* execution_barrier)( |
| iree_hal_command_buffer_t* command_buffer, |
| iree_hal_execution_stage_t source_stage_mask, |
| iree_hal_execution_stage_t target_stage_mask, |
| iree_hal_execution_barrier_flags_t flags, |
| iree_host_size_t memory_barrier_count, |
| const iree_hal_memory_barrier_t* memory_barriers, |
| iree_host_size_t buffer_barrier_count, |
| const iree_hal_buffer_barrier_t* buffer_barriers); |
| |
| iree_status_t(IREE_API_PTR* signal_event)( |
| iree_hal_command_buffer_t* command_buffer, iree_hal_event_t* event, |
| iree_hal_execution_stage_t source_stage_mask); |
| |
| iree_status_t(IREE_API_PTR* reset_event)( |
| iree_hal_command_buffer_t* command_buffer, iree_hal_event_t* event, |
| iree_hal_execution_stage_t source_stage_mask); |
| |
| iree_status_t(IREE_API_PTR* wait_events)( |
| iree_hal_command_buffer_t* command_buffer, iree_host_size_t event_count, |
| const iree_hal_event_t** events, |
| iree_hal_execution_stage_t source_stage_mask, |
| iree_hal_execution_stage_t target_stage_mask, |
| iree_host_size_t memory_barrier_count, |
| const iree_hal_memory_barrier_t* memory_barriers, |
| iree_host_size_t buffer_barrier_count, |
| const iree_hal_buffer_barrier_t* buffer_barriers); |
| |
| iree_status_t(IREE_API_PTR* advise_buffer)( |
| iree_hal_command_buffer_t* command_buffer, |
| iree_hal_buffer_ref_t buffer_ref, iree_hal_memory_advise_flags_t flags, |
| uint64_t arg0, uint64_t arg1); |
| |
| iree_status_t(IREE_API_PTR* fill_buffer)( |
| iree_hal_command_buffer_t* command_buffer, |
| iree_hal_buffer_ref_t target_ref, const void* pattern, |
| iree_host_size_t pattern_length, iree_hal_fill_flags_t flags); |
| |
| iree_status_t(IREE_API_PTR* update_buffer)( |
| iree_hal_command_buffer_t* command_buffer, const void* source_buffer, |
| iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref, |
| iree_hal_update_flags_t flags); |
| |
| iree_status_t(IREE_API_PTR* copy_buffer)( |
| iree_hal_command_buffer_t* command_buffer, |
| iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref, |
| iree_hal_copy_flags_t flags); |
| |
| iree_status_t(IREE_API_PTR* collective)( |
| iree_hal_command_buffer_t* command_buffer, iree_hal_channel_t* channel, |
| iree_hal_collective_op_t op, uint32_t param, |
| iree_hal_buffer_ref_t send_ref, iree_hal_buffer_ref_t recv_ref, |
| iree_device_size_t element_count); |
| |
| iree_status_t(IREE_API_PTR* dispatch)( |
| iree_hal_command_buffer_t* command_buffer, |
| iree_hal_executable_t* executable, int32_t entry_point, |
| const uint32_t workgroup_count[3], iree_const_byte_span_t constants, |
| iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags); |
| |
| iree_status_t(IREE_API_PTR* dispatch_indirect)( |
| iree_hal_command_buffer_t* command_buffer, |
| iree_hal_executable_t* executable, int32_t entry_point, |
| iree_hal_buffer_ref_t workgroups_ref, iree_const_byte_span_t constants, |
| iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags); |
| } iree_hal_command_buffer_vtable_t; |
| IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_command_buffer_vtable_t); |
| |
| struct iree_hal_command_buffer_t { |
| iree_hal_resource_t resource; |
| iree_hal_command_buffer_mode_t mode; |
| iree_hal_command_category_t allowed_categories; |
| iree_hal_queue_affinity_t queue_affinity; |
| uint32_t binding_capacity; |
| uint32_t binding_count; |
| void* validation_state; |
| }; |
| |
| // Returns the total size of the additional command buffer storage required for |
| // validating the command buffer. Returns 0 if no validation state is required. |
| IREE_API_EXPORT iree_host_size_t iree_hal_command_buffer_validation_state_size( |
| iree_hal_command_buffer_mode_t mode, iree_host_size_t binding_capacity); |
| |
| IREE_API_EXPORT void iree_hal_command_buffer_initialize( |
| iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode, |
| iree_hal_command_category_t command_categories, |
| iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, |
| void* validation_state, const iree_hal_command_buffer_vtable_t* vtable, |
| iree_hal_command_buffer_t* command_buffer); |
| |
| IREE_API_EXPORT void iree_hal_command_buffer_destroy( |
| iree_hal_command_buffer_t* command_buffer); |
| |
| #ifdef __cplusplus |
| } // extern "C" |
| #endif // __cplusplus |
| |
| #endif // IREE_HAL_COMMAND_BUFFER_H_ |