blob: 6d5b12f0d90bdebd85654817491b1b9dc42aac43 [file]
// Copyright 2025 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/hal/drivers/amdgpu/device/blit.h"
//===----------------------------------------------------------------------===//
// Buffer transfer operation utilities
//===----------------------------------------------------------------------===//
// Reserves the next packet in the queue and returns its packet_id.
// If tracing is enabled |out_completion_signal| will be populated with the
// signal that must be attached to the operation.
static uint64_t iree_hal_amdgpu_device_blit_reserve(
const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
context,
iree_hal_amdgpu_trace_execution_zone_type_t zone_type,
iree_hsa_signal_t* IREE_AMDGPU_RESTRICT out_completion_signal) {
#if IREE_HAL_AMDGPU_HAS_TRACING_FEATURE( \
IREE_HAL_AMDGPU_TRACING_FEATURE_DEVICE_EXECUTION)
if (context->trace_buffer) {
iree_hal_amdgpu_trace_execution_query_id_t execution_query_id =
iree_hal_amdgpu_device_query_ringbuffer_acquire(
&context->trace_buffer->query_ringbuffer);
*out_completion_signal =
iree_hal_amdgpu_device_trace_execution_zone_dispatch(
context->trace_buffer, zone_type, 0, execution_query_id);
} else {
*out_completion_signal = iree_hsa_signal_null();
}
#endif // IREE_HAL_AMDGPU_TRACING_FEATURE_DEVICE_EXECUTION
// Reserve the next packet in the queue.
const uint64_t packet_id = iree_hsa_queue_add_write_index(
&context->queue, 1u, iree_amdgpu_memory_order_relaxed);
while (packet_id - iree_hsa_queue_load_read_index(
&context->queue, iree_amdgpu_memory_order_acquire) >=
context->queue.size) {
iree_amdgpu_yield(); // spinning
}
return packet_id;
}
// Commits a reserved transfer packet.
// The header will be updated and the target queue doorbell will be signaled.
static void iree_hal_amdgpu_device_blit_commit(
const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
context,
uint64_t packet_id,
iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT packet,
iree_hsa_signal_t completion_signal) {
// Chain completion.
packet->completion_signal = completion_signal;
// Populate the header and release the packet to the queue.
uint16_t header = IREE_HSA_PACKET_TYPE_KERNEL_DISPATCH
<< IREE_HSA_PACKET_HEADER_TYPE;
// TODO(benvanik): need to pull in barrier/scope overrides from command buffer
// execution context flags. They should override the barrier bit and the
// scopes to be on SYSTEM regardless of what we choose here.
// NOTE: we don't need a barrier bit as the caller is expecting it to run
// concurrently if needed.
header |= 0 << IREE_HSA_PACKET_HEADER_BARRIER;
#if IREE_HAL_AMDGPU_HAS_TRACING_FEATURE( \
IREE_HAL_AMDGPU_TRACING_FEATURE_DEVICE_EXECUTION)
if (context->trace_buffer) {
// Force a barrier bit if we are tracing execution. This ensures that we get
// exclusive timing for the operation.
header |= 1 << IREE_HSA_PACKET_HEADER_BARRIER;
}
#endif // IREE_HAL_AMDGPU_TRACING_FEATURE_DEVICE_EXECUTION
// TODO(benvanik): scope to agent if the pointer is local, or maybe none in
// cases where surrounding barriers performed the cache management.
header |= IREE_HSA_FENCE_SCOPE_SYSTEM
<< IREE_HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE;
header |= IREE_HSA_FENCE_SCOPE_SYSTEM
<< IREE_HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE;
const uint32_t header_setup = header | (uint32_t)(packet->setup << 16);
iree_amdgpu_scoped_atomic_store(
(iree_amdgpu_scoped_atomic_uint32_t*)packet, header_setup,
iree_amdgpu_memory_order_release, iree_amdgpu_memory_scope_system);
// Signal the queue doorbell indicating the packet has been updated.
iree_hsa_signal_store(context->queue.doorbell_signal, packet_id,
iree_amdgpu_memory_order_relaxed);
}
//===----------------------------------------------------------------------===//
// Blit kernel utilities
//===----------------------------------------------------------------------===//
// 2 uint64_t values totaling 16 bytes.
typedef uint32_t iree_amdgpu_uint64x2_t __attribute__((vector_size(16)));
static inline size_t iree_hal_amdgpu_blit_linear_id(void) {
const size_t id_x = iree_hal_amdgpu_device_group_id_x() *
IREE_HAL_AMDGPU_BLIT_WORKGROUP_SIZE_X +
iree_hal_amdgpu_device_local_id_x();
const size_t id_y = iree_hal_amdgpu_device_group_id_y() *
IREE_HAL_AMDGPU_BLIT_WORKGROUP_SIZE_Y +
iree_hal_amdgpu_device_local_id_y();
return id_y * iree_amdgcn_dispatch_ptr()->grid_size[0] + id_x;
}
//===----------------------------------------------------------------------===//
// iree_hal_amdgpu_device_buffer_fill_*
//===----------------------------------------------------------------------===//
IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_fill_x1(
uint8_t* IREE_AMDGPU_RESTRICT target_ptr, const uint64_t element_length,
const uint8_t pattern) {
const size_t element_offset = iree_hal_amdgpu_blit_linear_id();
if (IREE_AMDGPU_LIKELY(element_offset < element_length)) {
// Slowest possible copy; benchmarks required to iterate on better impls.
target_ptr[element_offset] = pattern;
}
}
IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_fill_x2(
uint16_t* IREE_AMDGPU_RESTRICT target_ptr, const uint64_t element_length,
const uint16_t pattern) {
const size_t element_offset = iree_hal_amdgpu_blit_linear_id();
if (IREE_AMDGPU_LIKELY(element_offset < element_length)) {
// Slowest possible fill; benchmarks required to iterate on better impls.
target_ptr[element_offset] = pattern;
}
}
IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_fill_x4(
uint32_t* IREE_AMDGPU_RESTRICT target_ptr, const uint64_t element_length,
const uint32_t pattern) {
const size_t element_offset = iree_hal_amdgpu_blit_linear_id();
if (IREE_AMDGPU_LIKELY(element_offset < element_length)) {
// Slowest possible fill; benchmarks required to iterate on better impls.
target_ptr[element_offset] = pattern;
}
}
IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_fill_x8(
uint64_t* IREE_AMDGPU_RESTRICT target_ptr, const uint64_t element_length,
const uint64_t pattern) {
const size_t element_offset = iree_hal_amdgpu_blit_linear_id();
if (IREE_AMDGPU_LIKELY(element_offset < element_length)) {
// Slowest possible fill; benchmarks required to iterate on better impls.
target_ptr[element_offset] = pattern;
}
}
#define IREE_HAL_AMDGPU_FILL_BLOCK_ELEMENT_SIZE sizeof(iree_amdgpu_uint64x2_t)
#define IREE_HAL_AMDGPU_FILL_BLOCK_COUNT 8
#define IREE_HAL_AMDGPU_FILL_BLOCK_SIZE \
(IREE_HAL_AMDGPU_FILL_BLOCK_ELEMENT_SIZE * IREE_HAL_AMDGPU_FILL_BLOCK_COUNT)
// Fills a block of up to IREE_HAL_AMDGPU_FILL_BLOCK_COUNT 16-byte elements with
// a fixed pattern. Requires an alignment of 16-bytes on both the |target_ptr|
// and |length|.
IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_fill_block_x16(
iree_amdgpu_uint64x2_t* IREE_AMDGPU_RESTRICT target_ptr,
const uint64_t element_length, const uint64_t pattern) {
const size_t block_id = iree_hal_amdgpu_blit_linear_id();
const size_t element_offset = block_id * IREE_HAL_AMDGPU_FILL_BLOCK_COUNT;
if (IREE_AMDGPU_UNLIKELY(element_offset >= element_length)) return;
iree_amdgpu_uint64x2_t pattern_x16 = {pattern, pattern};
const size_t element_count =
IREE_AMDGPU_MIN(IREE_HAL_AMDGPU_FILL_BLOCK_COUNT,
element_length - element_offset) /
sizeof(pattern_x16);
if (IREE_AMDGPU_LIKELY(element_count == IREE_HAL_AMDGPU_FILL_BLOCK_COUNT)) {
#pragma unroll
for (int i = 0; i < IREE_HAL_AMDGPU_FILL_BLOCK_COUNT; ++i) {
target_ptr[element_offset + i] = pattern_x16;
}
} else {
for (int i = 0; i < element_count; ++i) {
target_ptr[element_offset + i] = pattern_x16;
}
}
}
// Returns the bytes of |pattern| of length |pattern_length| splatted to
// an 8-byte value.
static uint64_t iree_hal_amdgpu_device_extend_pattern_x8(
const uint64_t pattern, const uint8_t pattern_length) {
switch (pattern_length) {
case 1:
return ((uint64_t)pattern << 56) | ((uint64_t)pattern << 48) |
((uint64_t)pattern << 40) | ((uint64_t)pattern << 32) |
((uint64_t)pattern << 24) | ((uint64_t)pattern << 16) |
((uint64_t)pattern << 8) | pattern;
case 2:
return ((uint64_t)pattern << 48) | ((uint64_t)pattern << 32) |
((uint64_t)pattern << 16) | pattern;
case 4:
return ((uint64_t)pattern << 32) | pattern;
case 8:
return pattern;
default:
return 0;
}
}
iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT
iree_hal_amdgpu_device_buffer_fill_emplace_reserve(
const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
context,
void* target_ptr, const uint64_t length, uint64_t pattern,
const uint8_t pattern_length, uint64_t* IREE_AMDGPU_RESTRICT kernarg_ptr,
const uint64_t packet_id) {
IREE_AMDGPU_TRACE_BUFFER_SCOPE(context->trace_buffer);
IREE_AMDGPU_TRACE_ZONE_BEGIN(z0);
// Select the kernel for the fill operation.
const iree_hal_amdgpu_device_kernel_args_t* IREE_AMDGPU_RESTRICT kernel_args =
NULL;
size_t element_size = 1;
size_t block_size = 1;
if (iree_amdgpu_has_alignment((size_t)target_ptr,
IREE_HAL_AMDGPU_FILL_BLOCK_ELEMENT_SIZE) &&
iree_amdgpu_has_alignment(length,
IREE_HAL_AMDGPU_FILL_BLOCK_ELEMENT_SIZE)) {
IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "fill_block_x16");
pattern = iree_hal_amdgpu_device_extend_pattern_x8(pattern, pattern_length);
kernel_args =
&context->kernels->iree_hal_amdgpu_device_buffer_fill_block_x16;
element_size = IREE_HAL_AMDGPU_FILL_BLOCK_ELEMENT_SIZE;
block_size = IREE_HAL_AMDGPU_FILL_BLOCK_COUNT;
} else {
switch (pattern_length) {
case 1:
IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "fill_x1");
kernel_args = &context->kernels->iree_hal_amdgpu_device_buffer_fill_x1;
break;
case 2:
IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "fill_x2");
kernel_args = &context->kernels->iree_hal_amdgpu_device_buffer_fill_x2;
break;
case 4:
IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "fill_x4");
kernel_args = &context->kernels->iree_hal_amdgpu_device_buffer_fill_x4;
break;
case 8:
IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "fill_x8");
kernel_args = &context->kernels->iree_hal_amdgpu_device_buffer_fill_x8;
break;
}
element_size = pattern_length;
block_size = 1;
}
// Update kernargs (same API for all kernels).
const size_t element_count = length / element_size;
iree_hal_amdgpu_device_buffer_fill_kernargs_t* kernargs =
(iree_hal_amdgpu_device_buffer_fill_kernargs_t*)kernarg_ptr;
kernargs->target_ptr = target_ptr;
kernargs->element_length = element_count;
kernargs->pattern = pattern;
// To support fills with more than UINT_MAX elements (uint32_t grid_size)
// we divide the problem into chunks as needed. We keep the innermost chunk
// size small as if we do [X,Y,1] we're likely to overshoot and don't want to
// have too many wasted invocations.
const size_t block_count = IREE_AMDGPU_CEIL_DIV(element_count, block_size);
uint32_t grid_size_x = 1;
uint32_t grid_size_y = 1;
if (IREE_AMDGPU_LIKELY(block_count <= 0xFFFFFFFFu)) {
grid_size_x = (uint32_t)block_count;
} else {
grid_size_x = 256;
grid_size_y = (uint32_t)IREE_AMDGPU_CEIL_DIV(block_count, grid_size_x);
}
// Populate the packet.
const uint64_t queue_mask = context->queue.size - 1; // power of two
iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT dispatch_packet =
context->queue.base_address + (packet_id & queue_mask) * 64;
dispatch_packet->setup = kernel_args->setup;
dispatch_packet->workgroup_size[0] = kernel_args->workgroup_size[0];
dispatch_packet->workgroup_size[1] = kernel_args->workgroup_size[1];
dispatch_packet->workgroup_size[2] = kernel_args->workgroup_size[2];
dispatch_packet->reserved0 = 0;
dispatch_packet->grid_size[0] = grid_size_x;
dispatch_packet->grid_size[1] = grid_size_y;
dispatch_packet->grid_size[2] = 1;
dispatch_packet->private_segment_size = kernel_args->private_segment_size;
dispatch_packet->group_segment_size = kernel_args->group_segment_size;
dispatch_packet->kernel_object = kernel_args->kernel_object;
dispatch_packet->kernarg_address = kernarg_ptr;
dispatch_packet->reserved2 = 0;
IREE_AMDGPU_TRACE_ZONE_END(z0);
return dispatch_packet;
}
void iree_hal_amdgpu_device_buffer_fill_enqueue(
const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
context,
void* target_ptr, const uint64_t length, const uint64_t pattern,
const uint8_t pattern_length, uint64_t* IREE_AMDGPU_RESTRICT kernarg_ptr) {
IREE_AMDGPU_TRACE_BUFFER_SCOPE(context->trace_buffer);
IREE_AMDGPU_TRACE_ZONE_BEGIN(z0);
IREE_AMDGPU_TRACE_ZONE_APPEND_VALUE_I64(z0, length);
// Reserve and begin populating the operation packet.
// When tracing is enabled capture the timing signal.
iree_hsa_signal_t completion_signal = iree_hsa_signal_null();
const uint64_t packet_id = iree_hal_amdgpu_device_blit_reserve(
context, IREE_HAL_AMDGPU_TRACE_EXECUTION_ZONE_TYPE_FILL,
&completion_signal);
// Emplace the dispatch packet into the queue.
// Note that until the packet is issued the queue will stall.
iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT packet =
iree_hal_amdgpu_device_buffer_fill_emplace_reserve(
context, target_ptr, length, pattern, pattern_length, kernarg_ptr,
packet_id);
// Issues the buffer operation packet by configuring its header and signaling
// the queue doorbell.
iree_hal_amdgpu_device_blit_commit(context, packet_id, packet,
completion_signal);
IREE_AMDGPU_TRACE_ZONE_END(z0);
}
//===----------------------------------------------------------------------===//
// iree_hal_amdgpu_device_buffer_copy_*
//===----------------------------------------------------------------------===//
IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_copy_x1(
const uint8_t* IREE_AMDGPU_RESTRICT source_ptr,
uint8_t* IREE_AMDGPU_RESTRICT target_ptr, const uint64_t element_length) {
const size_t element_offset = iree_hal_amdgpu_blit_linear_id();
if (IREE_AMDGPU_LIKELY(element_offset < element_length)) {
// Slowest possible copy; benchmarks required to iterate on better impls.
target_ptr[element_offset] = source_ptr[element_offset];
}
}
#define IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE sizeof(iree_amdgpu_uint64x2_t)
#define IREE_HAL_AMDGPU_COPY_BLOCK_COUNT 8
#define IREE_HAL_AMDGPU_COPY_BLOCK_SIZE \
(IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE * IREE_HAL_AMDGPU_COPY_BLOCK_COUNT)
// Copies a block of up to IREE_HAL_AMDGPU_COPY_BLOCK_COUNT 16-byte elements
// from |source_ptr| to |target_ptr|. Requires an alignment of 16-bytes on all
// of |source_ptr|, |target_ptr|, and |length|.
//
// Dispatched on a 2D grid with up to UINT32_MAX blocks on X.
IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_copy_block_x16(
const iree_amdgpu_uint64x2_t* IREE_AMDGPU_RESTRICT source_ptr,
iree_amdgpu_uint64x2_t* IREE_AMDGPU_RESTRICT target_ptr,
const uint64_t element_length) {
const size_t block_id = iree_hal_amdgpu_blit_linear_id();
const size_t element_offset = block_id * IREE_HAL_AMDGPU_COPY_BLOCK_COUNT;
if (IREE_AMDGPU_UNLIKELY(element_offset >= element_length)) return;
const size_t element_count = IREE_AMDGPU_MIN(IREE_HAL_AMDGPU_COPY_BLOCK_COUNT,
element_length - element_offset);
if (IREE_AMDGPU_LIKELY(element_count == IREE_HAL_AMDGPU_COPY_BLOCK_COUNT)) {
#pragma unroll
for (int i = 0; i < IREE_HAL_AMDGPU_COPY_BLOCK_COUNT; ++i) {
target_ptr[element_offset + i] = source_ptr[element_offset + i];
}
} else {
for (int i = 0; i < element_count; ++i) {
target_ptr[element_offset + i] = source_ptr[element_offset + i];
}
}
}
// TODO(benvanik): experiment with enqueuing SDMA somehow (may need to take a
// DMA queue as well as the dispatch queue). Note that on some configurations
// (InfinityFabric) blit kernels can be 2x faster than SDMA.
iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT
iree_hal_amdgpu_device_buffer_copy_emplace_reserve(
const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
context,
const void* source_ptr, void* target_ptr, const uint64_t length,
uint64_t* IREE_AMDGPU_RESTRICT kernarg_ptr, const uint64_t packet_id) {
IREE_AMDGPU_TRACE_BUFFER_SCOPE(context->trace_buffer);
IREE_AMDGPU_TRACE_ZONE_BEGIN(z0);
// Select the kernel for the copy operation.
// TODO(benvanik): switch kernel based on source/target/length alignment.
const iree_hal_amdgpu_device_kernel_args_t* IREE_AMDGPU_RESTRICT kernel_args =
NULL;
size_t element_size = 1;
size_t block_size = 1;
if (iree_amdgpu_has_alignment((size_t)source_ptr,
IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE) &&
iree_amdgpu_has_alignment((size_t)target_ptr,
IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE) &&
iree_amdgpu_has_alignment(length,
IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE)) {
IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "copy_block_x16");
kernel_args =
&context->kernels->iree_hal_amdgpu_device_buffer_copy_block_x16;
element_size = IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE;
block_size = IREE_HAL_AMDGPU_COPY_BLOCK_COUNT;
} else {
IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "copy_x1");
kernel_args = &context->kernels->iree_hal_amdgpu_device_buffer_copy_x1;
element_size = 1;
block_size = 1;
}
// Update kernargs (same API for all kernels).
const size_t element_count = length / element_size;
iree_hal_amdgpu_device_buffer_copy_kernargs_t* kernargs =
(iree_hal_amdgpu_device_buffer_copy_kernargs_t*)kernarg_ptr;
kernargs->source_ptr = source_ptr;
kernargs->target_ptr = target_ptr;
kernargs->element_length = element_count;
// To support transfers with more than UINT_MAX elements (uint32_t grid_size)
// we divide the problem into chunks as needed. We keep the innermost chunk
// size small as if we do [X,Y,1] we're likely to overshoot and don't want to
// have too many wasted invocations.
const size_t block_count = IREE_AMDGPU_CEIL_DIV(element_count, block_size);
uint32_t grid_size_x = 1;
uint32_t grid_size_y = 1;
if (IREE_AMDGPU_LIKELY(block_count <= 0xFFFFFFFFu)) {
grid_size_x = (uint32_t)block_count;
} else {
grid_size_x = 256;
grid_size_y = (uint32_t)IREE_AMDGPU_CEIL_DIV(block_count, grid_size_x);
}
// Populate the packet.
const uint64_t queue_mask = context->queue.size - 1; // power of two
iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT dispatch_packet =
context->queue.base_address + (packet_id & queue_mask) * 64;
dispatch_packet->setup = kernel_args->setup;
dispatch_packet->workgroup_size[0] = kernel_args->workgroup_size[0];
dispatch_packet->workgroup_size[1] = kernel_args->workgroup_size[1];
dispatch_packet->workgroup_size[2] = kernel_args->workgroup_size[2];
dispatch_packet->reserved0 = 0;
dispatch_packet->grid_size[0] = grid_size_x;
dispatch_packet->grid_size[1] = grid_size_y;
dispatch_packet->grid_size[2] = 1;
dispatch_packet->private_segment_size = kernel_args->private_segment_size;
dispatch_packet->group_segment_size = kernel_args->group_segment_size;
dispatch_packet->kernel_object = kernel_args->kernel_object;
dispatch_packet->kernarg_address = kernarg_ptr;
dispatch_packet->reserved2 = 0;
IREE_AMDGPU_TRACE_ZONE_END(z0);
return dispatch_packet;
}
void iree_hal_amdgpu_device_buffer_copy_enqueue(
const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
context,
const void* source_ptr, void* target_ptr, const uint64_t length,
uint64_t* IREE_AMDGPU_RESTRICT kernarg_ptr) {
IREE_AMDGPU_TRACE_BUFFER_SCOPE(context->trace_buffer);
IREE_AMDGPU_TRACE_ZONE_BEGIN(z0);
IREE_AMDGPU_TRACE_ZONE_APPEND_VALUE_I64(z0, length);
// Reserve and begin populating the operation packet.
// When tracing is enabled capture the timing signal.
iree_hsa_signal_t completion_signal = iree_hsa_signal_null();
const uint64_t packet_id = iree_hal_amdgpu_device_blit_reserve(
context, IREE_HAL_AMDGPU_TRACE_EXECUTION_ZONE_TYPE_COPY,
&completion_signal);
// Emplace the dispatch packet into the queue.
// Note that until the packet is issued the queue will stall.
iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT packet =
iree_hal_amdgpu_device_buffer_copy_emplace_reserve(
context, source_ptr, target_ptr, length, kernarg_ptr, packet_id);
// Issues the buffer operation packet by configuring its header and signaling
// the queue doorbell.
iree_hal_amdgpu_device_blit_commit(context, packet_id, packet,
completion_signal);
IREE_AMDGPU_TRACE_ZONE_END(z0);
}