[metal] Add builtin executable for copy unaligned buffers
For buffer copy on macOS, Metal require source/destination offset
and length to be some multiple of 4. So for unaligned cases, we
need a builtin executable to do the copy by ourselves.
This commit adds such a kernel. It copies 1 byte per thread, which
should work for Metal 3 devices. The performance won't likely be
good though; however, this is just a fallback and we can optimize
cases like fill buffer builtin executables if necessary.
diff --git a/experimental/metal/direct_command_buffer.m b/experimental/metal/direct_command_buffer.m
index 7f9d81b..1ab3583 100644
--- a/experimental/metal/direct_command_buffer.m
+++ b/experimental/metal/direct_command_buffer.m
@@ -456,6 +456,37 @@
return status;
}
+static iree_status_t iree_hal_metal_command_buffer_copy_buffer_internal(
+ iree_hal_metal_command_buffer_t* command_buffer, id<MTLBuffer> source_device_buffer,
+ iree_device_size_t source_offset, id<MTLBuffer> target_device_buffer,
+ iree_device_size_t target_offset, iree_device_size_t length) {
+ // Per the spec for copyFromBuffer:sourceOffset:toBuffer:destinationOffset:size, the source/target
+ // offset and length must be a multiple of 4 bytes in macOS, and 1 byte in iOS and tvOS.
+#if defined(IREE_PLATFORM_MACOS)
+ bool can_use_metal_api = source_offset % 4 == 0 && target_offset % 4 == 0 && length % 4 == 0;
+#else
+ bool can_use_metal_api = true;
+#endif
+
+ iree_status_t status = iree_ok_status();
+ if (can_use_metal_api) {
+ id<MTLBlitCommandEncoder> encoder = iree_hal_metal_get_or_begin_blit_encoder(command_buffer);
+ [encoder copyFromBuffer:source_device_buffer
+ sourceOffset:source_offset
+ toBuffer:target_device_buffer
+ destinationOffset:target_offset
+ size:length];
+ } else {
+ id<MTLComputeCommandEncoder> encoder =
+ iree_hal_metal_get_or_begin_compute_encoder(command_buffer);
+ status = iree_hal_metal_builtin_executable_copy_buffer(
+ command_buffer->builtin_executable, encoder, source_device_buffer, source_offset,
+ target_device_buffer, target_offset, length);
+ }
+
+ return status;
+}
+
static iree_status_t iree_hal_metal_command_buffer_update_buffer(
iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
@@ -466,16 +497,6 @@
iree_hal_metal_command_buffer_cast(base_command_buffer);
IREE_TRACE_ZONE_BEGIN(z0);
- // Per the spec for copyFromBuffer:sourceOffset:toBuffer:destinationOffset:size, the source/target
- // offset and length must be a multiple of 4 bytes in macOS, and 1 byte in iOS and tvOS.
-#if defined(IREE_PLATFORM_MACOS)
- if (source_offset % 4 != 0 || target_offset % 4 != 0 || length % 4 != 0) {
- IREE_TRACE_ZONE_END(z0);
- return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
- "unimplemented buffer update with non-4-multiple offset/length");
- }
-#endif
-
id<MTLDevice> device = command_buffer->command_buffer.device;
MTLResourceOptions options = MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined;
id<MTLBuffer> data_buffer = [device newBufferWithBytes:((uint8_t*)source_buffer + source_offset)
@@ -492,15 +513,12 @@
iree_hal_metal_buffer_handle(iree_hal_buffer_allocated_buffer(target_buffer));
target_offset += iree_hal_buffer_byte_offset(target_buffer);
- id<MTLBlitCommandEncoder> encoder = iree_hal_metal_get_or_begin_blit_encoder(command_buffer);
- [encoder copyFromBuffer:data_buffer
- sourceOffset:0
- toBuffer:target_device_buffer
- destinationOffset:target_offset
- size:length];
+ iree_status_t status = iree_hal_metal_command_buffer_copy_buffer_internal(
+ command_buffer, data_buffer, /*source_offset=*/0, target_device_buffer, target_offset,
+ length);
IREE_TRACE_ZONE_END(z0);
- return iree_ok_status();
+ return status;
}
static iree_status_t iree_hal_metal_command_buffer_copy_buffer(
@@ -511,16 +529,6 @@
iree_hal_metal_command_buffer_cast(base_command_buffer);
IREE_TRACE_ZONE_BEGIN(z0);
- // Per the spec for copyFromBuffer:sourceOffset:toBuffer:destinationOffset:size, the source/target
- // offset and length must be a multiple of 4 bytes in macOS, and 1 byte in iOS and tvOS.
-#if defined(IREE_PLATFORM_MACOS)
- if (source_offset % 4 != 0 || target_offset % 4 != 0 || length % 4 != 0) {
- IREE_TRACE_ZONE_END(z0);
- return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
- "unimplemented copy buffer with non-4-multiple offset/length");
- }
-#endif
-
const iree_hal_buffer_t* buffers[2] = {source_buffer, target_buffer};
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_resource_set_insert(command_buffer->resource_set, 2, buffers));
@@ -533,14 +541,12 @@
source_offset += iree_hal_buffer_byte_offset(source_buffer);
target_offset += iree_hal_buffer_byte_offset(target_buffer);
- id<MTLBlitCommandEncoder> encoder = iree_hal_metal_get_or_begin_blit_encoder(command_buffer);
- [encoder copyFromBuffer:source_device_buffer
- sourceOffset:source_offset
- toBuffer:target_device_buffer
- destinationOffset:target_offset
- size:length];
+ iree_status_t status = iree_hal_metal_command_buffer_copy_buffer_internal(
+ command_buffer, source_device_buffer, source_offset, target_device_buffer, target_offset,
+ length);
- return iree_ok_status();
+ IREE_TRACE_ZONE_END(z0);
+ return status;
}
static iree_status_t iree_hal_metal_command_buffer_collective(