[metal] Use staging buffer for argument buffers and update sources
This commit switches the Metal HAL driver to use a staging buffer
for recording argument buffers and uploading buffer update source
data. This avoids creating lots of small buffers like we did
previously, and avoids relying on command buffer completion
callbacks to manage their lifetimes.
diff --git a/experimental/metal/direct_command_buffer.m b/experimental/metal/direct_command_buffer.m
index 7117969..a6f3fbf 100644
--- a/experimental/metal/direct_command_buffer.m
+++ b/experimental/metal/direct_command_buffer.m
@@ -13,6 +13,7 @@
#include "experimental/metal/metal_device.h"
#include "experimental/metal/metal_kernel_library.h"
#include "experimental/metal/pipeline_layout.h"
+#include "experimental/metal/staging_buffer.h"
#include "iree/base/api.h"
#include "iree/base/target_platform.h"
#include "iree/base/tracing.h"
@@ -168,6 +169,10 @@
// Arena used for all allocations; references the shared device block pool.
iree_arena_allocator_t arena;
+ // Per-queue shared uniform staging buffer for uploading parameters to the GPU, including argument
+ // buffers and buffer update source buffers.
+ iree_hal_metal_staging_buffer_t* staging_buffer;
+
// Linked list of command segments to be recorded into a command buffer.
iree_hal_metal_command_segment_list_t segments;
@@ -319,6 +324,7 @@
iree_hal_command_category_t command_categories, iree_host_size_t binding_capacity,
iree_hal_metal_command_buffer_resource_reference_mode_t resource_reference_mode,
id<MTLCommandQueue> queue, iree_allocator_t host_allocator, iree_arena_block_pool_t* block_pool,
+ iree_hal_metal_staging_buffer_t* staging_buffer,
iree_hal_metal_builtin_executable_t* builtin_executable,
iree_hal_command_buffer_t** out_command_buffer) {
IREE_ASSERT_ARGUMENT(device);
@@ -344,6 +350,7 @@
command_buffer->queue = [queue retain]; // +1
command_buffer->builtin_executable = builtin_executable;
iree_arena_initialize(block_pool, &command_buffer->arena);
+ command_buffer->staging_buffer = staging_buffer;
iree_hal_metal_command_segment_list_reset(&command_buffer->segments);
@autoreleasepool { // Use @autoreleasepool to trigger the autorelease within encoder creation.
// We track resource lifetime by ourselves in IREE; so just do unretained references to
@@ -737,6 +744,7 @@
segment->target_buffer, segment->target_offset, segment->length);
}
+ IREE_TRACE_ZONE_END(z0);
return status;
}
@@ -744,20 +752,19 @@
iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
iree_device_size_t target_offset, iree_device_size_t length) {
- // There are no direct corresponding APIs in Metal. We emulate it by creating a buffer with the
- // content and then copy it over.
iree_hal_metal_command_buffer_t* command_buffer =
iree_hal_metal_command_buffer_cast(base_command_buffer);
IREE_TRACE_ZONE_BEGIN(z0);
- id<MTLDevice> device = command_buffer->command_buffer.device;
- MTLResourceOptions options = MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined;
- id<MTLBuffer> data_buffer = [device newBufferWithBytes:((uint8_t*)source_buffer + source_offset)
- length:length
- options:options]; // +1
- [command_buffer->command_buffer addCompletedHandler:^(id<MTLCommandBuffer> cmdbuf) {
- [data_buffer release]; // -1
- }];
+ // There is no directly corresponding API in Metal. We copy the source data into the shared
+ // staging buffer and then record a buffer copy from it into the target buffer.
+
+ iree_const_byte_span_t source_data_span =
+ iree_make_const_byte_span((uint8_t*)source_buffer + source_offset, length);
+ uint32_t offset = 0;
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_hal_metal_staging_buffer_append(command_buffer->staging_buffer, source_data_span,
+ /*alignment=*/4, &offset));
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1, &target_buffer));
@@ -767,8 +774,8 @@
target_offset += iree_hal_buffer_byte_offset(target_buffer);
iree_status_t status = iree_hal_metal_command_segment_create_copy_buffer(
- command_buffer, data_buffer, /*source_offset=*/0, target_device_buffer, target_offset,
- length);
+ command_buffer, command_buffer->staging_buffer->metal_buffer, offset, target_device_buffer,
+ target_offset, length);
IREE_TRACE_ZONE_END(z0);
return status;
@@ -931,37 +938,6 @@
return iree_ok_status();
}
-// Creates an argument encoder and its backing argument buffer for the given kernel |function|'s
-// |buffer_index|. The argument encoder will be set to encode into the newly created argument
-// buffer. Callers are expected to release both the argument encoder and buffer.
-static iree_status_t iree_hal_metal_create_argument_encoder(
- id<MTLDevice> device, id<MTLCommandBuffer> command_buffer, id<MTLFunction> function,
- uint32_t buffer_index, id<MTLArgumentEncoder>* out_encoder, id<MTLBuffer>* out_buffer) {
- id<MTLArgumentEncoder> argument_encoder =
- [function newArgumentEncoderWithBufferIndex:buffer_index]; // +1
- IREE_ASSERT(argument_encoder != nil);
-
- __block id<MTLBuffer> argument_buffer =
- [device newBufferWithLength:argument_encoder.encodedLength
- options:MTLResourceStorageModeShared]; // +1
- if (!argument_buffer) {
- return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
- "failed to create argument buffer with size = %ld bytes",
- argument_encoder.encodedLength);
- }
-
- // The arugment encoder and buffer can be deleted once the command buffer completes.
- [command_buffer addCompletedHandler:^(id<MTLCommandBuffer> cmdbuf) {
- [argument_buffer release]; // -1
- [argument_encoder release]; // -1
- }];
-
- [argument_encoder setArgumentBuffer:argument_buffer offset:0];
- *out_encoder = argument_encoder;
- *out_buffer = argument_buffer;
- return iree_ok_status();
-}
-
// Prepares kernels and argument buffers needed for kernel dispatches.
static iree_status_t iree_hal_metal_command_segment_create_dispatch(
iree_hal_command_buffer_t* base_command_buffer, iree_hal_executable_t* executable,
@@ -1033,17 +1009,23 @@
// Record argument buffers for all descriptors and record buffer usages.
iree_hal_metal_descriptor_t* descriptors = segment->descriptors;
- iree_host_size_t i = 0;
- while (i < segment->descriptor_count) {
+ for (iree_host_size_t i = 0; i < segment->descriptor_count;) {
uint32_t current_set = descriptors[i].set;
// Build argument encoder and argument buffer for the current descriptor set.
- id<MTLArgumentEncoder> argument_encoder;
- id<MTLBuffer> argument_buffer;
+ id<MTLBuffer> argument_buffer = command_buffer->staging_buffer->metal_buffer;
+ id<MTLArgumentEncoder> argument_encoder =
+ [segment->kernel_params.function newArgumentEncoderWithBufferIndex:current_set]; // +1
+ IREE_ASSERT(argument_encoder != nil);
+
+ // Reserve space for the argument buffer from shared staging buffer.
+ iree_byte_span_t reservation;
+ uint32_t argument_buffer_offset;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
- z0, iree_hal_metal_create_argument_encoder(
- command_buffer->command_buffer.device, command_buffer->command_buffer,
- segment->kernel_params.function, current_set, &argument_encoder, &argument_buffer));
+ z0, iree_hal_metal_staging_buffer_reserve(
+ command_buffer->staging_buffer, argument_encoder.encodedLength,
+ argument_encoder.alignment, &reservation, &argument_buffer_offset));
+ [argument_encoder setArgumentBuffer:argument_buffer offset:argument_buffer_offset];
// Now record all bound buffers belonging to the current set into the argument buffer.
for (; i < segment->descriptor_count && descriptors[i].set == current_set; ++i) {
@@ -1058,7 +1040,9 @@
[compute_encoder useResource:current_buffer usage:descriptors[i].usage];
}
// Record the argument buffer.
- [compute_encoder setBuffer:argument_buffer offset:0 atIndex:current_set];
+ [compute_encoder setBuffer:argument_buffer offset:argument_buffer_offset atIndex:current_set];
+
+ [argument_encoder release]; // -1
}
// Record the dispatch, either direct or indirect.