runtime/src/iree/hal/drivers/amdgpu/device/blit.c - 3p/openxla/iree - Git at Google

 // Copyright 2025 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 #include "iree/hal/drivers/amdgpu/device/blit.h"

 //===----------------------------------------------------------------------===//
 // Buffer transfer operation utilities
 //===----------------------------------------------------------------------===//

 // Reserves the next packet in the queue and returns its packet_id.
 // If tracing is enabled |out_completion_signal| will be populated with the
 // signal that must be attached to the operation.
 static uint64_t iree_hal_amdgpu_device_blit_reserve(
     const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
         context,
     iree_hal_amdgpu_trace_execution_zone_type_t zone_type,
     iree_hsa_signal_t* IREE_AMDGPU_RESTRICT out_completion_signal) {
 #if IREE_HAL_AMDGPU_HAS_TRACING_FEATURE( \
     IREE_HAL_AMDGPU_TRACING_FEATURE_DEVICE_EXECUTION)
   if (context->trace_buffer) {
     iree_hal_amdgpu_trace_execution_query_id_t execution_query_id =
         iree_hal_amdgpu_device_query_ringbuffer_acquire(
             &context->trace_buffer->query_ringbuffer);
     *out_completion_signal =
         iree_hal_amdgpu_device_trace_execution_zone_dispatch(
             context->trace_buffer, zone_type, 0, execution_query_id);
   } else {
     *out_completion_signal = iree_hsa_signal_null();
   }
 #endif  // IREE_HAL_AMDGPU_TRACING_FEATURE_DEVICE_EXECUTION

   // Reserve the next packet in the queue.
   const uint64_t packet_id = iree_hsa_queue_add_write_index(
       &context->queue, 1u, iree_amdgpu_memory_order_relaxed);
   while (packet_id - iree_hsa_queue_load_read_index(
                          &context->queue, iree_amdgpu_memory_order_acquire) >=
          context->queue.size) {
     iree_amdgpu_yield();  // spinning
   }

   return packet_id;
 }

 // Commits a reserved transfer packet.
 // The header will be updated and the target queue doorbell will be signaled.
 static void iree_hal_amdgpu_device_blit_commit(
     const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
         context,
     uint64_t packet_id,
     iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT packet,
     iree_hsa_signal_t completion_signal) {
   // Chain completion.
   packet->completion_signal = completion_signal;

   // Populate the header and release the packet to the queue.
   uint16_t header = IREE_HSA_PACKET_TYPE_KERNEL_DISPATCH
                     << IREE_HSA_PACKET_HEADER_TYPE;

   // TODO(benvanik): need to pull in barrier/scope overrides from command buffer
   // execution context flags. They should override the barrier bit and the
   // scopes to be on SYSTEM regardless of what we choose here.

   // NOTE: we don't need a barrier bit as the caller is expecting it to run
   // concurrently if needed.
   header |= 0 << IREE_HSA_PACKET_HEADER_BARRIER;

 #if IREE_HAL_AMDGPU_HAS_TRACING_FEATURE( \
     IREE_HAL_AMDGPU_TRACING_FEATURE_DEVICE_EXECUTION)
   if (context->trace_buffer) {
     // Force a barrier bit if we are tracing execution. This ensures that we get
     // exclusive timing for the operation.
     header |= 1 << IREE_HSA_PACKET_HEADER_BARRIER;
   }
 #endif  // IREE_HAL_AMDGPU_TRACING_FEATURE_DEVICE_EXECUTION

   // TODO(benvanik): scope to agent if the pointer is local, or maybe none in
   // cases where surrounding barriers performed the cache management.
   header |= IREE_HSA_FENCE_SCOPE_SYSTEM
             << IREE_HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE;
   header |= IREE_HSA_FENCE_SCOPE_SYSTEM
             << IREE_HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE;

   const uint32_t header_setup = header | (uint32_t)(packet->setup << 16);
   iree_amdgpu_scoped_atomic_store(
       (iree_amdgpu_scoped_atomic_uint32_t*)packet, header_setup,
       iree_amdgpu_memory_order_release, iree_amdgpu_memory_scope_system);

   // Signal the queue doorbell indicating the packet has been updated.
   iree_hsa_signal_store(context->queue.doorbell_signal, packet_id,
                         iree_amdgpu_memory_order_relaxed);
 }

 //===----------------------------------------------------------------------===//
 // Blit kernel utilities
 //===----------------------------------------------------------------------===//

 // 2 uint64_t values totaling 16 bytes.
 typedef uint32_t iree_amdgpu_uint64x2_t __attribute__((vector_size(16)));

 static inline size_t iree_hal_amdgpu_blit_linear_id(void) {
   const size_t id_x = iree_hal_amdgpu_device_group_id_x() *
                           IREE_HAL_AMDGPU_BLIT_WORKGROUP_SIZE_X +
                       iree_hal_amdgpu_device_local_id_x();
   const size_t id_y = iree_hal_amdgpu_device_group_id_y() *
                           IREE_HAL_AMDGPU_BLIT_WORKGROUP_SIZE_Y +
                       iree_hal_amdgpu_device_local_id_y();
   return id_y * iree_amdgcn_dispatch_ptr()->grid_size[0] + id_x;
 }

 //===----------------------------------------------------------------------===//
 // iree_hal_amdgpu_device_buffer_fill_*
 //===----------------------------------------------------------------------===//

 IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_fill_x1(
     uint8_t* IREE_AMDGPU_RESTRICT target_ptr, const uint64_t element_length,
     const uint8_t pattern) {
   const size_t element_offset = iree_hal_amdgpu_blit_linear_id();
   if (IREE_AMDGPU_LIKELY(element_offset < element_length)) {
     // Slowest possible copy; benchmarks required to iterate on better impls.
     target_ptr[element_offset] = pattern;
   }
 }

 IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_fill_x2(
     uint16_t* IREE_AMDGPU_RESTRICT target_ptr, const uint64_t element_length,
     const uint16_t pattern) {
   const size_t element_offset = iree_hal_amdgpu_blit_linear_id();
   if (IREE_AMDGPU_LIKELY(element_offset < element_length)) {
     // Slowest possible fill; benchmarks required to iterate on better impls.
     target_ptr[element_offset] = pattern;
   }
 }

 IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_fill_x4(
     uint32_t* IREE_AMDGPU_RESTRICT target_ptr, const uint64_t element_length,
     const uint32_t pattern) {
   const size_t element_offset = iree_hal_amdgpu_blit_linear_id();
   if (IREE_AMDGPU_LIKELY(element_offset < element_length)) {
     // Slowest possible fill; benchmarks required to iterate on better impls.
     target_ptr[element_offset] = pattern;
   }
 }

 IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_fill_x8(
     uint64_t* IREE_AMDGPU_RESTRICT target_ptr, const uint64_t element_length,
     const uint64_t pattern) {
   const size_t element_offset = iree_hal_amdgpu_blit_linear_id();
   if (IREE_AMDGPU_LIKELY(element_offset < element_length)) {
     // Slowest possible fill; benchmarks required to iterate on better impls.
     target_ptr[element_offset] = pattern;
   }
 }

 #define IREE_HAL_AMDGPU_FILL_BLOCK_ELEMENT_SIZE sizeof(iree_amdgpu_uint64x2_t)
 #define IREE_HAL_AMDGPU_FILL_BLOCK_COUNT 8
 #define IREE_HAL_AMDGPU_FILL_BLOCK_SIZE \
   (IREE_HAL_AMDGPU_FILL_BLOCK_ELEMENT_SIZE * IREE_HAL_AMDGPU_FILL_BLOCK_COUNT)

 // Fills a block of up to IREE_HAL_AMDGPU_FILL_BLOCK_COUNT 16-byte elements with
 // a fixed pattern. Requires an alignment of 16-bytes on both the |target_ptr|
 // and |length|.
 IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_fill_block_x16(
     iree_amdgpu_uint64x2_t* IREE_AMDGPU_RESTRICT target_ptr,
     const uint64_t element_length, const uint64_t pattern) {
   const size_t block_id = iree_hal_amdgpu_blit_linear_id();
   const size_t element_offset = block_id * IREE_HAL_AMDGPU_FILL_BLOCK_COUNT;
   if (IREE_AMDGPU_UNLIKELY(element_offset >= element_length)) return;
   iree_amdgpu_uint64x2_t pattern_x16 = {pattern, pattern};
   const size_t element_count =
       IREE_AMDGPU_MIN(IREE_HAL_AMDGPU_FILL_BLOCK_COUNT,
                       element_length - element_offset) /
       sizeof(pattern_x16);
   if (IREE_AMDGPU_LIKELY(element_count == IREE_HAL_AMDGPU_FILL_BLOCK_COUNT)) {
 #pragma unroll
     for (int i = 0; i < IREE_HAL_AMDGPU_FILL_BLOCK_COUNT; ++i) {
       target_ptr[element_offset + i] = pattern_x16;
     }
   } else {
     for (int i = 0; i < element_count; ++i) {
       target_ptr[element_offset + i] = pattern_x16;
     }
   }
 }

 // Returns the bytes of |pattern| of length |pattern_length| splatted to
 // an 8-byte value.
 static uint64_t iree_hal_amdgpu_device_extend_pattern_x8(
     const uint64_t pattern, const uint8_t pattern_length) {
   switch (pattern_length) {
     case 1:
       return ((uint64_t)pattern << 56) | ((uint64_t)pattern << 48) |
              ((uint64_t)pattern << 40) | ((uint64_t)pattern << 32) |
              ((uint64_t)pattern << 24) | ((uint64_t)pattern << 16) |
              ((uint64_t)pattern << 8) | pattern;
     case 2:
       return ((uint64_t)pattern << 48) | ((uint64_t)pattern << 32) |
              ((uint64_t)pattern << 16) | pattern;
     case 4:
       return ((uint64_t)pattern << 32) | pattern;
     case 8:
       return pattern;
     default:
       return 0;
   }
 }

 iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT
 iree_hal_amdgpu_device_buffer_fill_emplace_reserve(
     const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
         context,
     void* target_ptr, const uint64_t length, uint64_t pattern,
     const uint8_t pattern_length, uint64_t* IREE_AMDGPU_RESTRICT kernarg_ptr,
     const uint64_t packet_id) {
   IREE_AMDGPU_TRACE_BUFFER_SCOPE(context->trace_buffer);
   IREE_AMDGPU_TRACE_ZONE_BEGIN(z0);

   // Select the kernel for the fill operation.
   const iree_hal_amdgpu_device_kernel_args_t* IREE_AMDGPU_RESTRICT kernel_args =
       NULL;
   size_t element_size = 1;
   size_t block_size = 1;
   if (iree_amdgpu_has_alignment((size_t)target_ptr,
                                 IREE_HAL_AMDGPU_FILL_BLOCK_ELEMENT_SIZE) &&
       iree_amdgpu_has_alignment(length,
                                 IREE_HAL_AMDGPU_FILL_BLOCK_ELEMENT_SIZE)) {
     IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "fill_block_x16");
     pattern = iree_hal_amdgpu_device_extend_pattern_x8(pattern, pattern_length);
     kernel_args =
         &context->kernels->iree_hal_amdgpu_device_buffer_fill_block_x16;
     element_size = IREE_HAL_AMDGPU_FILL_BLOCK_ELEMENT_SIZE;
     block_size = IREE_HAL_AMDGPU_FILL_BLOCK_COUNT;
   } else {
     switch (pattern_length) {
       case 1:
         IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "fill_x1");
         kernel_args = &context->kernels->iree_hal_amdgpu_device_buffer_fill_x1;
         break;
       case 2:
         IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "fill_x2");
         kernel_args = &context->kernels->iree_hal_amdgpu_device_buffer_fill_x2;
         break;
       case 4:
         IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "fill_x4");
         kernel_args = &context->kernels->iree_hal_amdgpu_device_buffer_fill_x4;
         break;
       case 8:
         IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "fill_x8");
         kernel_args = &context->kernels->iree_hal_amdgpu_device_buffer_fill_x8;
         break;
     }
     element_size = pattern_length;
     block_size = 1;
   }

   // Update kernargs (same API for all kernels).
   const size_t element_count = length / element_size;
   iree_hal_amdgpu_device_buffer_fill_kernargs_t* kernargs =
       (iree_hal_amdgpu_device_buffer_fill_kernargs_t*)kernarg_ptr;
   kernargs->target_ptr = target_ptr;
   kernargs->element_length = element_count;
   kernargs->pattern = pattern;

   // To support fills with more than UINT_MAX elements (uint32_t grid_size)
   // we divide the problem into chunks as needed. We keep the innermost chunk
   // size small as if we do [X,Y,1] we're likely to overshoot and don't want to
   // have too many wasted invocations.
   const size_t block_count = IREE_AMDGPU_CEIL_DIV(element_count, block_size);
   uint32_t grid_size_x = 1;
   uint32_t grid_size_y = 1;
   if (IREE_AMDGPU_LIKELY(block_count <= 0xFFFFFFFFu)) {
     grid_size_x = (uint32_t)block_count;
   } else {
     grid_size_x = 256;
     grid_size_y = (uint32_t)IREE_AMDGPU_CEIL_DIV(block_count, grid_size_x);
   }

   // Populate the packet.
   const uint64_t queue_mask = context->queue.size - 1;  // power of two
   iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT dispatch_packet =
       context->queue.base_address + (packet_id & queue_mask) * 64;
   dispatch_packet->setup = kernel_args->setup;
   dispatch_packet->workgroup_size[0] = kernel_args->workgroup_size[0];
   dispatch_packet->workgroup_size[1] = kernel_args->workgroup_size[1];
   dispatch_packet->workgroup_size[2] = kernel_args->workgroup_size[2];
   dispatch_packet->reserved0 = 0;
   dispatch_packet->grid_size[0] = grid_size_x;
   dispatch_packet->grid_size[1] = grid_size_y;
   dispatch_packet->grid_size[2] = 1;
   dispatch_packet->private_segment_size = kernel_args->private_segment_size;
   dispatch_packet->group_segment_size = kernel_args->group_segment_size;
   dispatch_packet->kernel_object = kernel_args->kernel_object;
   dispatch_packet->kernarg_address = kernarg_ptr;
   dispatch_packet->reserved2 = 0;

   IREE_AMDGPU_TRACE_ZONE_END(z0);
   return dispatch_packet;
 }

 void iree_hal_amdgpu_device_buffer_fill_enqueue(
     const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
         context,
     void* target_ptr, const uint64_t length, const uint64_t pattern,
     const uint8_t pattern_length, uint64_t* IREE_AMDGPU_RESTRICT kernarg_ptr) {
   IREE_AMDGPU_TRACE_BUFFER_SCOPE(context->trace_buffer);
   IREE_AMDGPU_TRACE_ZONE_BEGIN(z0);
   IREE_AMDGPU_TRACE_ZONE_APPEND_VALUE_I64(z0, length);

   // Reserve and begin populating the operation packet.
   // When tracing is enabled capture the timing signal.
   iree_hsa_signal_t completion_signal = iree_hsa_signal_null();
   const uint64_t packet_id = iree_hal_amdgpu_device_blit_reserve(
       context, IREE_HAL_AMDGPU_TRACE_EXECUTION_ZONE_TYPE_FILL,
       &completion_signal);

   // Emplace the dispatch packet into the queue.
   // Note that until the packet is issued the queue will stall.
   iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT packet =
       iree_hal_amdgpu_device_buffer_fill_emplace_reserve(
           context, target_ptr, length, pattern, pattern_length, kernarg_ptr,
           packet_id);

   // Issues the buffer operation packet by configuring its header and signaling
   // the queue doorbell.
   iree_hal_amdgpu_device_blit_commit(context, packet_id, packet,
                                      completion_signal);

   IREE_AMDGPU_TRACE_ZONE_END(z0);
 }

 //===----------------------------------------------------------------------===//
 // iree_hal_amdgpu_device_buffer_copy_*
 //===----------------------------------------------------------------------===//

 IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_copy_x1(
     const uint8_t* IREE_AMDGPU_RESTRICT source_ptr,
     uint8_t* IREE_AMDGPU_RESTRICT target_ptr, const uint64_t element_length) {
   const size_t element_offset = iree_hal_amdgpu_blit_linear_id();
   if (IREE_AMDGPU_LIKELY(element_offset < element_length)) {
     // Slowest possible copy; benchmarks required to iterate on better impls.
     target_ptr[element_offset] = source_ptr[element_offset];
   }
 }

 #define IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE sizeof(iree_amdgpu_uint64x2_t)
 #define IREE_HAL_AMDGPU_COPY_BLOCK_COUNT 8
 #define IREE_HAL_AMDGPU_COPY_BLOCK_SIZE \
   (IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE * IREE_HAL_AMDGPU_COPY_BLOCK_COUNT)

 // Copies a block of up to IREE_HAL_AMDGPU_COPY_BLOCK_COUNT 16-byte elements
 // from |source_ptr| to |target_ptr|. Requires an alignment of 16-bytes on all
 // of |source_ptr|, |target_ptr|, and |length|.
 //
 // Dispatched on a 2D grid with up to UINT32_MAX blocks on X.
 IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_copy_block_x16(
     const iree_amdgpu_uint64x2_t* IREE_AMDGPU_RESTRICT source_ptr,
     iree_amdgpu_uint64x2_t* IREE_AMDGPU_RESTRICT target_ptr,
     const uint64_t element_length) {
   const size_t block_id = iree_hal_amdgpu_blit_linear_id();
   const size_t element_offset = block_id * IREE_HAL_AMDGPU_COPY_BLOCK_COUNT;
   if (IREE_AMDGPU_UNLIKELY(element_offset >= element_length)) return;
   const size_t element_count = IREE_AMDGPU_MIN(IREE_HAL_AMDGPU_COPY_BLOCK_COUNT,
                                                element_length - element_offset);
   if (IREE_AMDGPU_LIKELY(element_count == IREE_HAL_AMDGPU_COPY_BLOCK_COUNT)) {
 #pragma unroll
     for (int i = 0; i < IREE_HAL_AMDGPU_COPY_BLOCK_COUNT; ++i) {
       target_ptr[element_offset + i] = source_ptr[element_offset + i];
     }
   } else {
     for (int i = 0; i < element_count; ++i) {
       target_ptr[element_offset + i] = source_ptr[element_offset + i];
     }
   }
 }

 // TODO(benvanik): experiment with enqueuing SDMA somehow (may need to take a
 // DMA queue as well as the dispatch queue). Note that on some configurations
 // (InfinityFabric) blit kernels can be 2x faster than SDMA.
 iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT
 iree_hal_amdgpu_device_buffer_copy_emplace_reserve(
     const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
         context,
     const void* source_ptr, void* target_ptr, const uint64_t length,
     uint64_t* IREE_AMDGPU_RESTRICT kernarg_ptr, const uint64_t packet_id) {
   IREE_AMDGPU_TRACE_BUFFER_SCOPE(context->trace_buffer);
   IREE_AMDGPU_TRACE_ZONE_BEGIN(z0);

   // Select the kernel for the copy operation.
   // TODO(benvanik): switch kernel based on source/target/length alignment.
   const iree_hal_amdgpu_device_kernel_args_t* IREE_AMDGPU_RESTRICT kernel_args =
       NULL;
   size_t element_size = 1;
   size_t block_size = 1;
   if (iree_amdgpu_has_alignment((size_t)source_ptr,
                                 IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE) &&
       iree_amdgpu_has_alignment((size_t)target_ptr,
                                 IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE) &&
       iree_amdgpu_has_alignment(length,
                                 IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE)) {
     IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "copy_block_x16");
     kernel_args =
         &context->kernels->iree_hal_amdgpu_device_buffer_copy_block_x16;
     element_size = IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE;
     block_size = IREE_HAL_AMDGPU_COPY_BLOCK_COUNT;
   } else {
     IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "copy_x1");
     kernel_args = &context->kernels->iree_hal_amdgpu_device_buffer_copy_x1;
     element_size = 1;
     block_size = 1;
   }

   // Update kernargs (same API for all kernels).
   const size_t element_count = length / element_size;
   iree_hal_amdgpu_device_buffer_copy_kernargs_t* kernargs =
       (iree_hal_amdgpu_device_buffer_copy_kernargs_t*)kernarg_ptr;
   kernargs->source_ptr = source_ptr;
   kernargs->target_ptr = target_ptr;
   kernargs->element_length = element_count;

   // To support transfers with more than UINT_MAX elements (uint32_t grid_size)
   // we divide the problem into chunks as needed. We keep the innermost chunk
   // size small as if we do [X,Y,1] we're likely to overshoot and don't want to
   // have too many wasted invocations.
   const size_t block_count = IREE_AMDGPU_CEIL_DIV(element_count, block_size);
   uint32_t grid_size_x = 1;
   uint32_t grid_size_y = 1;
   if (IREE_AMDGPU_LIKELY(block_count <= 0xFFFFFFFFu)) {
     grid_size_x = (uint32_t)block_count;
   } else {
     grid_size_x = 256;
     grid_size_y = (uint32_t)IREE_AMDGPU_CEIL_DIV(block_count, grid_size_x);
   }

   // Populate the packet.
   const uint64_t queue_mask = context->queue.size - 1;  // power of two
   iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT dispatch_packet =
       context->queue.base_address + (packet_id & queue_mask) * 64;
   dispatch_packet->setup = kernel_args->setup;
   dispatch_packet->workgroup_size[0] = kernel_args->workgroup_size[0];
   dispatch_packet->workgroup_size[1] = kernel_args->workgroup_size[1];
   dispatch_packet->workgroup_size[2] = kernel_args->workgroup_size[2];
   dispatch_packet->reserved0 = 0;
   dispatch_packet->grid_size[0] = grid_size_x;
   dispatch_packet->grid_size[1] = grid_size_y;
   dispatch_packet->grid_size[2] = 1;
   dispatch_packet->private_segment_size = kernel_args->private_segment_size;
   dispatch_packet->group_segment_size = kernel_args->group_segment_size;
   dispatch_packet->kernel_object = kernel_args->kernel_object;
   dispatch_packet->kernarg_address = kernarg_ptr;
   dispatch_packet->reserved2 = 0;

   IREE_AMDGPU_TRACE_ZONE_END(z0);
   return dispatch_packet;
 }

 void iree_hal_amdgpu_device_buffer_copy_enqueue(
     const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
         context,
     const void* source_ptr, void* target_ptr, const uint64_t length,
     uint64_t* IREE_AMDGPU_RESTRICT kernarg_ptr) {
   IREE_AMDGPU_TRACE_BUFFER_SCOPE(context->trace_buffer);
   IREE_AMDGPU_TRACE_ZONE_BEGIN(z0);
   IREE_AMDGPU_TRACE_ZONE_APPEND_VALUE_I64(z0, length);

   // Reserve and begin populating the operation packet.
   // When tracing is enabled capture the timing signal.
   iree_hsa_signal_t completion_signal = iree_hsa_signal_null();
   const uint64_t packet_id = iree_hal_amdgpu_device_blit_reserve(
       context, IREE_HAL_AMDGPU_TRACE_EXECUTION_ZONE_TYPE_COPY,
       &completion_signal);

   // Emplace the dispatch packet into the queue.
   // Note that until the packet is issued the queue will stall.
   iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT packet =
       iree_hal_amdgpu_device_buffer_copy_emplace_reserve(
           context, source_ptr, target_ptr, length, kernarg_ptr, packet_id);

   // Issues the buffer operation packet by configuring its header and signaling
   // the queue doorbell.
   iree_hal_amdgpu_device_blit_commit(context, packet_id, packet,
                                      completion_signal);

   IREE_AMDGPU_TRACE_ZONE_END(z0);
 }
	// Copyright 2025 The IREE Authors
	//
	// Licensed under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

	#include "iree/hal/drivers/amdgpu/device/blit.h"

	//===----------------------------------------------------------------------===//
	// Buffer transfer operation utilities
	//===----------------------------------------------------------------------===//

	// Reserves the next packet in the queue and returns its packet_id.
	// If tracing is enabled \|out_completion_signal\| will be populated with the
	// signal that must be attached to the operation.
	static uint64_t iree_hal_amdgpu_device_blit_reserve(
	const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
	context,
	iree_hal_amdgpu_trace_execution_zone_type_t zone_type,
	iree_hsa_signal_t* IREE_AMDGPU_RESTRICT out_completion_signal) {
	#if IREE_HAL_AMDGPU_HAS_TRACING_FEATURE( \
	IREE_HAL_AMDGPU_TRACING_FEATURE_DEVICE_EXECUTION)
	if (context->trace_buffer) {
	iree_hal_amdgpu_trace_execution_query_id_t execution_query_id =
	iree_hal_amdgpu_device_query_ringbuffer_acquire(
	&context->trace_buffer->query_ringbuffer);
	*out_completion_signal =
	iree_hal_amdgpu_device_trace_execution_zone_dispatch(
	context->trace_buffer, zone_type, 0, execution_query_id);
	} else {
	*out_completion_signal = iree_hsa_signal_null();
	}
	#endif // IREE_HAL_AMDGPU_TRACING_FEATURE_DEVICE_EXECUTION

	// Reserve the next packet in the queue.
	const uint64_t packet_id = iree_hsa_queue_add_write_index(
	&context->queue, 1u, iree_amdgpu_memory_order_relaxed);
	while (packet_id - iree_hsa_queue_load_read_index(
	&context->queue, iree_amdgpu_memory_order_acquire) >=
	context->queue.size) {
	iree_amdgpu_yield(); // spinning
	}

	return packet_id;
	}

	// Commits a reserved transfer packet.
	// The header will be updated and the target queue doorbell will be signaled.
	static void iree_hal_amdgpu_device_blit_commit(
	const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
	context,
	uint64_t packet_id,
	iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT packet,
	iree_hsa_signal_t completion_signal) {
	// Chain completion.
	packet->completion_signal = completion_signal;

	// Populate the header and release the packet to the queue.
	uint16_t header = IREE_HSA_PACKET_TYPE_KERNEL_DISPATCH
	<< IREE_HSA_PACKET_HEADER_TYPE;

	// TODO(benvanik): need to pull in barrier/scope overrides from command buffer
	// execution context flags. They should override the barrier bit and the
	// scopes to be on SYSTEM regardless of what we choose here.

	// NOTE: we don't need a barrier bit as the caller is expecting it to run
	// concurrently if needed.
	header \|= 0 << IREE_HSA_PACKET_HEADER_BARRIER;

	#if IREE_HAL_AMDGPU_HAS_TRACING_FEATURE( \
	IREE_HAL_AMDGPU_TRACING_FEATURE_DEVICE_EXECUTION)
	if (context->trace_buffer) {
	// Force a barrier bit if we are tracing execution. This ensures that we get
	// exclusive timing for the operation.
	header \|= 1 << IREE_HSA_PACKET_HEADER_BARRIER;
	}
	#endif // IREE_HAL_AMDGPU_TRACING_FEATURE_DEVICE_EXECUTION

	// TODO(benvanik): scope to agent if the pointer is local, or maybe none in
	// cases where surrounding barriers performed the cache management.
	header \|= IREE_HSA_FENCE_SCOPE_SYSTEM
	<< IREE_HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE;
	header \|= IREE_HSA_FENCE_SCOPE_SYSTEM
	<< IREE_HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE;

	const uint32_t header_setup = header \| (uint32_t)(packet->setup << 16);
	iree_amdgpu_scoped_atomic_store(
	(iree_amdgpu_scoped_atomic_uint32_t*)packet, header_setup,
	iree_amdgpu_memory_order_release, iree_amdgpu_memory_scope_system);

	// Signal the queue doorbell indicating the packet has been updated.
	iree_hsa_signal_store(context->queue.doorbell_signal, packet_id,
	iree_amdgpu_memory_order_relaxed);
	}

	//===----------------------------------------------------------------------===//
	// Blit kernel utilities
	//===----------------------------------------------------------------------===//

	// 2 uint64_t values totaling 16 bytes.
	typedef uint32_t iree_amdgpu_uint64x2_t __attribute__((vector_size(16)));

	static inline size_t iree_hal_amdgpu_blit_linear_id(void) {
	const size_t id_x = iree_hal_amdgpu_device_group_id_x() *
	IREE_HAL_AMDGPU_BLIT_WORKGROUP_SIZE_X +
	iree_hal_amdgpu_device_local_id_x();
	const size_t id_y = iree_hal_amdgpu_device_group_id_y() *
	IREE_HAL_AMDGPU_BLIT_WORKGROUP_SIZE_Y +
	iree_hal_amdgpu_device_local_id_y();
	return id_y * iree_amdgcn_dispatch_ptr()->grid_size[0] + id_x;
	}

	//===----------------------------------------------------------------------===//
	// iree_hal_amdgpu_device_buffer_fill_*
	//===----------------------------------------------------------------------===//

	IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_fill_x1(
	uint8_t* IREE_AMDGPU_RESTRICT target_ptr, const uint64_t element_length,
	const uint8_t pattern) {
	const size_t element_offset = iree_hal_amdgpu_blit_linear_id();
	if (IREE_AMDGPU_LIKELY(element_offset < element_length)) {
	// Slowest possible copy; benchmarks required to iterate on better impls.
	target_ptr[element_offset] = pattern;
	}
	}

	IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_fill_x2(
	uint16_t* IREE_AMDGPU_RESTRICT target_ptr, const uint64_t element_length,
	const uint16_t pattern) {
	const size_t element_offset = iree_hal_amdgpu_blit_linear_id();
	if (IREE_AMDGPU_LIKELY(element_offset < element_length)) {
	// Slowest possible fill; benchmarks required to iterate on better impls.
	target_ptr[element_offset] = pattern;
	}
	}

	IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_fill_x4(
	uint32_t* IREE_AMDGPU_RESTRICT target_ptr, const uint64_t element_length,
	const uint32_t pattern) {
	const size_t element_offset = iree_hal_amdgpu_blit_linear_id();
	if (IREE_AMDGPU_LIKELY(element_offset < element_length)) {
	// Slowest possible fill; benchmarks required to iterate on better impls.
	target_ptr[element_offset] = pattern;
	}
	}

	IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_fill_x8(
	uint64_t* IREE_AMDGPU_RESTRICT target_ptr, const uint64_t element_length,
	const uint64_t pattern) {
	const size_t element_offset = iree_hal_amdgpu_blit_linear_id();
	if (IREE_AMDGPU_LIKELY(element_offset < element_length)) {
	// Slowest possible fill; benchmarks required to iterate on better impls.
	target_ptr[element_offset] = pattern;
	}
	}

	#define IREE_HAL_AMDGPU_FILL_BLOCK_ELEMENT_SIZE sizeof(iree_amdgpu_uint64x2_t)
	#define IREE_HAL_AMDGPU_FILL_BLOCK_COUNT 8
	#define IREE_HAL_AMDGPU_FILL_BLOCK_SIZE \
	(IREE_HAL_AMDGPU_FILL_BLOCK_ELEMENT_SIZE * IREE_HAL_AMDGPU_FILL_BLOCK_COUNT)

	// Fills a block of up to IREE_HAL_AMDGPU_FILL_BLOCK_COUNT 16-byte elements with
	// a fixed pattern. Requires an alignment of 16-bytes on both the \|target_ptr\|
	// and \|length\|.
	IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_fill_block_x16(
	iree_amdgpu_uint64x2_t* IREE_AMDGPU_RESTRICT target_ptr,
	const uint64_t element_length, const uint64_t pattern) {
	const size_t block_id = iree_hal_amdgpu_blit_linear_id();
	const size_t element_offset = block_id * IREE_HAL_AMDGPU_FILL_BLOCK_COUNT;
	if (IREE_AMDGPU_UNLIKELY(element_offset >= element_length)) return;
	iree_amdgpu_uint64x2_t pattern_x16 = {pattern, pattern};
	const size_t element_count =
	IREE_AMDGPU_MIN(IREE_HAL_AMDGPU_FILL_BLOCK_COUNT,
	element_length - element_offset) /
	sizeof(pattern_x16);
	if (IREE_AMDGPU_LIKELY(element_count == IREE_HAL_AMDGPU_FILL_BLOCK_COUNT)) {
	#pragma unroll
	for (int i = 0; i < IREE_HAL_AMDGPU_FILL_BLOCK_COUNT; ++i) {
	target_ptr[element_offset + i] = pattern_x16;
	}
	} else {
	for (int i = 0; i < element_count; ++i) {
	target_ptr[element_offset + i] = pattern_x16;
	}
	}
	}

	// Returns the bytes of \|pattern\| of length \|pattern_length\| splatted to
	// an 8-byte value.
	static uint64_t iree_hal_amdgpu_device_extend_pattern_x8(
	const uint64_t pattern, const uint8_t pattern_length) {
	switch (pattern_length) {
	case 1:
	return ((uint64_t)pattern << 56) \| ((uint64_t)pattern << 48) \|
	((uint64_t)pattern << 40) \| ((uint64_t)pattern << 32) \|
	((uint64_t)pattern << 24) \| ((uint64_t)pattern << 16) \|
	((uint64_t)pattern << 8) \| pattern;
	case 2:
	return ((uint64_t)pattern << 48) \| ((uint64_t)pattern << 32) \|
	((uint64_t)pattern << 16) \| pattern;
	case 4:
	return ((uint64_t)pattern << 32) \| pattern;
	case 8:
	return pattern;
	default:
	return 0;
	}
	}

	iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT
	iree_hal_amdgpu_device_buffer_fill_emplace_reserve(
	const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
	context,
	void* target_ptr, const uint64_t length, uint64_t pattern,
	const uint8_t pattern_length, uint64_t* IREE_AMDGPU_RESTRICT kernarg_ptr,
	const uint64_t packet_id) {
	IREE_AMDGPU_TRACE_BUFFER_SCOPE(context->trace_buffer);
	IREE_AMDGPU_TRACE_ZONE_BEGIN(z0);

	// Select the kernel for the fill operation.
	const iree_hal_amdgpu_device_kernel_args_t* IREE_AMDGPU_RESTRICT kernel_args =
	NULL;
	size_t element_size = 1;
	size_t block_size = 1;
	if (iree_amdgpu_has_alignment((size_t)target_ptr,
	IREE_HAL_AMDGPU_FILL_BLOCK_ELEMENT_SIZE) &&
	iree_amdgpu_has_alignment(length,
	IREE_HAL_AMDGPU_FILL_BLOCK_ELEMENT_SIZE)) {
	IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "fill_block_x16");
	pattern = iree_hal_amdgpu_device_extend_pattern_x8(pattern, pattern_length);
	kernel_args =
	&context->kernels->iree_hal_amdgpu_device_buffer_fill_block_x16;
	element_size = IREE_HAL_AMDGPU_FILL_BLOCK_ELEMENT_SIZE;
	block_size = IREE_HAL_AMDGPU_FILL_BLOCK_COUNT;
	} else {
	switch (pattern_length) {
	case 1:
	IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "fill_x1");
	kernel_args = &context->kernels->iree_hal_amdgpu_device_buffer_fill_x1;
	break;
	case 2:
	IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "fill_x2");
	kernel_args = &context->kernels->iree_hal_amdgpu_device_buffer_fill_x2;
	break;
	case 4:
	IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "fill_x4");
	kernel_args = &context->kernels->iree_hal_amdgpu_device_buffer_fill_x4;
	break;
	case 8:
	IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "fill_x8");
	kernel_args = &context->kernels->iree_hal_amdgpu_device_buffer_fill_x8;
	break;
	}
	element_size = pattern_length;
	block_size = 1;
	}

	// Update kernargs (same API for all kernels).
	const size_t element_count = length / element_size;
	iree_hal_amdgpu_device_buffer_fill_kernargs_t* kernargs =
	(iree_hal_amdgpu_device_buffer_fill_kernargs_t*)kernarg_ptr;
	kernargs->target_ptr = target_ptr;
	kernargs->element_length = element_count;
	kernargs->pattern = pattern;

	// To support fills with more than UINT_MAX elements (uint32_t grid_size)
	// we divide the problem into chunks as needed. We keep the innermost chunk
	// size small as if we do [X,Y,1] we're likely to overshoot and don't want to
	// have too many wasted invocations.
	const size_t block_count = IREE_AMDGPU_CEIL_DIV(element_count, block_size);
	uint32_t grid_size_x = 1;
	uint32_t grid_size_y = 1;
	if (IREE_AMDGPU_LIKELY(block_count <= 0xFFFFFFFFu)) {
	grid_size_x = (uint32_t)block_count;
	} else {
	grid_size_x = 256;
	grid_size_y = (uint32_t)IREE_AMDGPU_CEIL_DIV(block_count, grid_size_x);
	}

	// Populate the packet.
	const uint64_t queue_mask = context->queue.size - 1; // power of two
	iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT dispatch_packet =
	context->queue.base_address + (packet_id & queue_mask) * 64;
	dispatch_packet->setup = kernel_args->setup;
	dispatch_packet->workgroup_size[0] = kernel_args->workgroup_size[0];
	dispatch_packet->workgroup_size[1] = kernel_args->workgroup_size[1];
	dispatch_packet->workgroup_size[2] = kernel_args->workgroup_size[2];
	dispatch_packet->reserved0 = 0;
	dispatch_packet->grid_size[0] = grid_size_x;
	dispatch_packet->grid_size[1] = grid_size_y;
	dispatch_packet->grid_size[2] = 1;
	dispatch_packet->private_segment_size = kernel_args->private_segment_size;
	dispatch_packet->group_segment_size = kernel_args->group_segment_size;
	dispatch_packet->kernel_object = kernel_args->kernel_object;
	dispatch_packet->kernarg_address = kernarg_ptr;
	dispatch_packet->reserved2 = 0;

	IREE_AMDGPU_TRACE_ZONE_END(z0);
	return dispatch_packet;
	}

	void iree_hal_amdgpu_device_buffer_fill_enqueue(
	const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
	context,
	void* target_ptr, const uint64_t length, const uint64_t pattern,
	const uint8_t pattern_length, uint64_t* IREE_AMDGPU_RESTRICT kernarg_ptr) {
	IREE_AMDGPU_TRACE_BUFFER_SCOPE(context->trace_buffer);
	IREE_AMDGPU_TRACE_ZONE_BEGIN(z0);
	IREE_AMDGPU_TRACE_ZONE_APPEND_VALUE_I64(z0, length);

	// Reserve and begin populating the operation packet.
	// When tracing is enabled capture the timing signal.
	iree_hsa_signal_t completion_signal = iree_hsa_signal_null();
	const uint64_t packet_id = iree_hal_amdgpu_device_blit_reserve(
	context, IREE_HAL_AMDGPU_TRACE_EXECUTION_ZONE_TYPE_FILL,
	&completion_signal);

	// Emplace the dispatch packet into the queue.
	// Note that until the packet is issued the queue will stall.
	iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT packet =
	iree_hal_amdgpu_device_buffer_fill_emplace_reserve(
	context, target_ptr, length, pattern, pattern_length, kernarg_ptr,
	packet_id);

	// Issues the buffer operation packet by configuring its header and signaling
	// the queue doorbell.
	iree_hal_amdgpu_device_blit_commit(context, packet_id, packet,
	completion_signal);

	IREE_AMDGPU_TRACE_ZONE_END(z0);
	}

	//===----------------------------------------------------------------------===//
	// iree_hal_amdgpu_device_buffer_copy_*
	//===----------------------------------------------------------------------===//

	IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_copy_x1(
	const uint8_t* IREE_AMDGPU_RESTRICT source_ptr,
	uint8_t* IREE_AMDGPU_RESTRICT target_ptr, const uint64_t element_length) {
	const size_t element_offset = iree_hal_amdgpu_blit_linear_id();
	if (IREE_AMDGPU_LIKELY(element_offset < element_length)) {
	// Slowest possible copy; benchmarks required to iterate on better impls.
	target_ptr[element_offset] = source_ptr[element_offset];
	}
	}

	#define IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE sizeof(iree_amdgpu_uint64x2_t)
	#define IREE_HAL_AMDGPU_COPY_BLOCK_COUNT 8
	#define IREE_HAL_AMDGPU_COPY_BLOCK_SIZE \
	(IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE * IREE_HAL_AMDGPU_COPY_BLOCK_COUNT)

	// Copies a block of up to IREE_HAL_AMDGPU_COPY_BLOCK_COUNT 16-byte elements
	// from \|source_ptr\| to \|target_ptr\|. Requires an alignment of 16-bytes on all
	// of \|source_ptr\|, \|target_ptr\|, and \|length\|.
	//
	// Dispatched on a 2D grid with up to UINT32_MAX blocks on X.
	IREE_AMDGPU_ATTRIBUTE_KERNEL void iree_hal_amdgpu_device_buffer_copy_block_x16(
	const iree_amdgpu_uint64x2_t* IREE_AMDGPU_RESTRICT source_ptr,
	iree_amdgpu_uint64x2_t* IREE_AMDGPU_RESTRICT target_ptr,
	const uint64_t element_length) {
	const size_t block_id = iree_hal_amdgpu_blit_linear_id();
	const size_t element_offset = block_id * IREE_HAL_AMDGPU_COPY_BLOCK_COUNT;
	if (IREE_AMDGPU_UNLIKELY(element_offset >= element_length)) return;
	const size_t element_count = IREE_AMDGPU_MIN(IREE_HAL_AMDGPU_COPY_BLOCK_COUNT,
	element_length - element_offset);
	if (IREE_AMDGPU_LIKELY(element_count == IREE_HAL_AMDGPU_COPY_BLOCK_COUNT)) {
	#pragma unroll
	for (int i = 0; i < IREE_HAL_AMDGPU_COPY_BLOCK_COUNT; ++i) {
	target_ptr[element_offset + i] = source_ptr[element_offset + i];
	}
	} else {
	for (int i = 0; i < element_count; ++i) {
	target_ptr[element_offset + i] = source_ptr[element_offset + i];
	}
	}
	}

	// TODO(benvanik): experiment with enqueuing SDMA somehow (may need to take a
	// DMA queue as well as the dispatch queue). Note that on some configurations
	// (InfinityFabric) blit kernels can be 2x faster than SDMA.
	iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT
	iree_hal_amdgpu_device_buffer_copy_emplace_reserve(
	const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
	context,
	const void* source_ptr, void* target_ptr, const uint64_t length,
	uint64_t* IREE_AMDGPU_RESTRICT kernarg_ptr, const uint64_t packet_id) {
	IREE_AMDGPU_TRACE_BUFFER_SCOPE(context->trace_buffer);
	IREE_AMDGPU_TRACE_ZONE_BEGIN(z0);

	// Select the kernel for the copy operation.
	// TODO(benvanik): switch kernel based on source/target/length alignment.
	const iree_hal_amdgpu_device_kernel_args_t* IREE_AMDGPU_RESTRICT kernel_args =
	NULL;
	size_t element_size = 1;
	size_t block_size = 1;
	if (iree_amdgpu_has_alignment((size_t)source_ptr,
	IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE) &&
	iree_amdgpu_has_alignment((size_t)target_ptr,
	IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE) &&
	iree_amdgpu_has_alignment(length,
	IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE)) {
	IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "copy_block_x16");
	kernel_args =
	&context->kernels->iree_hal_amdgpu_device_buffer_copy_block_x16;
	element_size = IREE_HAL_AMDGPU_COPY_BLOCK_ELEMENT_SIZE;
	block_size = IREE_HAL_AMDGPU_COPY_BLOCK_COUNT;
	} else {
	IREE_AMDGPU_TRACE_ZONE_APPEND_TEXT_LITERAL(z0, "copy_x1");
	kernel_args = &context->kernels->iree_hal_amdgpu_device_buffer_copy_x1;
	element_size = 1;
	block_size = 1;
	}

	// Update kernargs (same API for all kernels).
	const size_t element_count = length / element_size;
	iree_hal_amdgpu_device_buffer_copy_kernargs_t* kernargs =
	(iree_hal_amdgpu_device_buffer_copy_kernargs_t*)kernarg_ptr;
	kernargs->source_ptr = source_ptr;
	kernargs->target_ptr = target_ptr;
	kernargs->element_length = element_count;

	// To support transfers with more than UINT_MAX elements (uint32_t grid_size)
	// we divide the problem into chunks as needed. We keep the innermost chunk
	// size small as if we do [X,Y,1] we're likely to overshoot and don't want to
	// have too many wasted invocations.
	const size_t block_count = IREE_AMDGPU_CEIL_DIV(element_count, block_size);
	uint32_t grid_size_x = 1;
	uint32_t grid_size_y = 1;
	if (IREE_AMDGPU_LIKELY(block_count <= 0xFFFFFFFFu)) {
	grid_size_x = (uint32_t)block_count;
	} else {
	grid_size_x = 256;
	grid_size_y = (uint32_t)IREE_AMDGPU_CEIL_DIV(block_count, grid_size_x);
	}

	// Populate the packet.
	const uint64_t queue_mask = context->queue.size - 1; // power of two
	iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT dispatch_packet =
	context->queue.base_address + (packet_id & queue_mask) * 64;
	dispatch_packet->setup = kernel_args->setup;
	dispatch_packet->workgroup_size[0] = kernel_args->workgroup_size[0];
	dispatch_packet->workgroup_size[1] = kernel_args->workgroup_size[1];
	dispatch_packet->workgroup_size[2] = kernel_args->workgroup_size[2];
	dispatch_packet->reserved0 = 0;
	dispatch_packet->grid_size[0] = grid_size_x;
	dispatch_packet->grid_size[1] = grid_size_y;
	dispatch_packet->grid_size[2] = 1;
	dispatch_packet->private_segment_size = kernel_args->private_segment_size;
	dispatch_packet->group_segment_size = kernel_args->group_segment_size;
	dispatch_packet->kernel_object = kernel_args->kernel_object;
	dispatch_packet->kernarg_address = kernarg_ptr;
	dispatch_packet->reserved2 = 0;

	IREE_AMDGPU_TRACE_ZONE_END(z0);
	return dispatch_packet;
	}

	void iree_hal_amdgpu_device_buffer_copy_enqueue(
	const iree_hal_amdgpu_device_buffer_transfer_context_t* IREE_AMDGPU_RESTRICT
	context,
	const void* source_ptr, void* target_ptr, const uint64_t length,
	uint64_t* IREE_AMDGPU_RESTRICT kernarg_ptr) {
	IREE_AMDGPU_TRACE_BUFFER_SCOPE(context->trace_buffer);
	IREE_AMDGPU_TRACE_ZONE_BEGIN(z0);
	IREE_AMDGPU_TRACE_ZONE_APPEND_VALUE_I64(z0, length);

	// Reserve and begin populating the operation packet.
	// When tracing is enabled capture the timing signal.
	iree_hsa_signal_t completion_signal = iree_hsa_signal_null();
	const uint64_t packet_id = iree_hal_amdgpu_device_blit_reserve(
	context, IREE_HAL_AMDGPU_TRACE_EXECUTION_ZONE_TYPE_COPY,
	&completion_signal);

	// Emplace the dispatch packet into the queue.
	// Note that until the packet is issued the queue will stall.
	iree_hsa_kernel_dispatch_packet_t* IREE_AMDGPU_RESTRICT packet =
	iree_hal_amdgpu_device_buffer_copy_emplace_reserve(
	context, source_ptr, target_ptr, length, kernarg_ptr, packet_id);

	// Issues the buffer operation packet by configuring its header and signaling
	// the queue doorbell.
	iree_hal_amdgpu_device_blit_commit(context, packet_id, packet,
	completion_signal);

	IREE_AMDGPU_TRACE_ZONE_END(z0);
	}