[metal] Add builtin executable for copy unaligned buffers
For buffer copy on macOS, Metal require source/destination offset
and length to be some multiple of 4. So for unaligned cases, we
need a builtin executable to do the copy by ourselves.
This commit adds such a kernel. It copies 1 byte per thread, which
should work for Metal 3 devices. The performance won't likely be
good though; however, this is just a fallback and we can optimize
cases like fill buffer builtin executables if necessary.
diff --git a/experimental/metal/builtin/CMakeLists.txt b/experimental/metal/builtin/CMakeLists.txt
index e41cc83..5c0c0e1 100644
--- a/experimental/metal/builtin/CMakeLists.txt
+++ b/experimental/metal/builtin/CMakeLists.txt
@@ -10,13 +10,14 @@
NAME
builtin
SRCS
+ "copy_buffer_generic.metal"
"fill_buffer_generic.metal"
C_FILE_OUTPUT
- "fill_buffer_generic.c"
+ "metal_buffer_kernels.c"
H_FILE_OUTPUT
- "fill_buffer_generic.h"
+ "metal_buffer_kernels.h"
IDENTIFIER
- "fill_buffer_generic"
+ "metal_buffer_kernels"
FLATTEN
PUBLIC
)
diff --git a/experimental/metal/builtin/copy_buffer_generic.metal b/experimental/metal/builtin/copy_buffer_generic.metal
new file mode 100644
index 0000000..ea66a5d
--- /dev/null
+++ b/experimental/metal/builtin/copy_buffer_generic.metal
@@ -0,0 +1,27 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Polyfill Metal kernels for buffer copies without 4-byte aligned offsets / lengths.
+
+struct CopySpec {
+ uint64_t src_buffer_offset; // Source buffer offset (in bytes)
+ uint64_t dst_buffer_offset; // Destination buffer offset (in bytes)
+ uint64_t length; // Buffer length to fill (in bytes)
+};
+
+// Copies data from |src_buffer| to |dst_buffer| with the given |spec|ification.
+//
+// No alignment requirement on source/destination buffer offset and length.
+// Each thread copies over one byte. This won't be good for perf; so just a fallback.
+kernel void copy_buffer_1byte(
+ device uint8_t *src_buffer [[buffer(0)]],
+ device uint8_t *dst_buffer [[buffer(1)]],
+ constant CopySpec &spec [[buffer(2)]],
+ uint id [[thread_position_in_grid]]
+) {
+ if (id >= spec.length) return;
+ dst_buffer[spec.dst_buffer_offset + id] = src_buffer[spec.src_buffer_offset + id];
+}
diff --git a/experimental/metal/builtin_executables.h b/experimental/metal/builtin_executables.h
index 5e26325..5c9c07e 100644
--- a/experimental/metal/builtin_executables.h
+++ b/experimental/metal/builtin_executables.h
@@ -47,6 +47,18 @@
iree_device_size_t target_offset, iree_device_size_t length,
uint32_t pattern);
+// Copies the |source_buffer| at |source_offset| to the |target_buffer| at
+// |target_offset| of |length| using builtin executables dispatched via
+// |encoder|.
+//
+// Under the hood, this will record all necessary commands to bind kernel
+// objects and buffer resources, and the perform dispatch.
+iree_status_t iree_hal_metal_builtin_executable_copy_buffer(
+ const iree_hal_metal_builtin_executable_t* executable,
+ id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> source_buffer,
+ iree_device_size_t source_offset, id<MTLBuffer> target_buffer,
+ iree_device_size_t target_offset, iree_device_size_t length);
+
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
diff --git a/experimental/metal/builtin_executables.m b/experimental/metal/builtin_executables.m
index 1cc8000..287538e 100644
--- a/experimental/metal/builtin_executables.m
+++ b/experimental/metal/builtin_executables.m
@@ -7,17 +7,25 @@
#include "experimental/metal/builtin_executables.h"
#include <string.h>
-#include "experimental/metal/builtin/fill_buffer_generic.h"
+
+#include "experimental/metal/builtin/metal_buffer_kernels.h"
#include "iree/base/api.h"
#include "iree/base/tracing.h"
#include "iree/hal/api.h"
-// The list of builtin executable entry points. This MUST be consistent with kernel function names
-// in MSL source code.
-static const char* iree_hal_metal_builtin_executable_entry_points[] = {
- "fill_buffer_16byte", // Buffer fills; 16-byte aligned offset/length
- "fill_buffer_4byte", // Buffer fills; 4-byte aligned offset/length
- "fill_buffer_1byte", // Buffer fills; 1-byte aligned offset/length
+typedef struct iree_hal_metal_builtin_executable_data_t {
+ const char* entry_point;
+ uint32_t file_index;
+} iree_hal_metal_builtin_executable_data_t;
+
+// The list of builtin executable entry points and their source file index in builtin exectuable
+// embedded data. This MUST be consistent with kernel function names in MSL source code and the file
+// order in embedded data.
+static iree_hal_metal_builtin_executable_data_t iree_hal_metal_builtin_executable_entry_points[] = {
+ {"fill_buffer_16byte", 1}, // Buffer fills; 16-byte aligned offset/length
+ {"fill_buffer_4byte", 1}, // Buffer fills; 4-byte aligned offset/length
+ {"fill_buffer_1byte", 1}, // Buffer fills; 1-byte aligned offset/length
+ {"copy_buffer_1byte", 0}, // Buffer copies; 1-byte aligned offset/length
};
// The buffer fill specificiation. This MUST be consistent with the same struct in MSL source code.
@@ -27,6 +35,12 @@
uint32_t pattern; // 32-bit fill pattern
} iree_hal_metal_buffer_fill_spec_t;
+typedef struct iree_hal_metal_buffer_copy_spec_t {
+ uint64_t src_buffer_offset; // Source buffer offset (in bytes)
+ uint64_t dst_buffer_offset; // Destination buffer offset (in bytes)
+ uint64_t length; // Buffer length to fill (in bytes)
+} iree_hal_metal_buffer_copy_spec_t;
+
iree_status_t iree_hal_metal_builtin_executable_create(
id<MTLDevice> device, iree_allocator_t host_allocator,
iree_hal_metal_builtin_executable_t** out_executable) {
@@ -54,15 +68,17 @@
MTLCompileOptions* compile_options = [MTLCompileOptions new]; // +1
compile_options.languageVersion = MTLLanguageVersion3_0;
- const char* fill_buffer_source_data = fill_buffer_generic_create()[0].data;
- for (unsigned i = 0; i < IREE_ARRAYSIZE(iree_hal_metal_builtin_executable_entry_points); ++i) {
- const char* entry_point = iree_hal_metal_builtin_executable_entry_points[i];
+ for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(iree_hal_metal_builtin_executable_entry_points);
+ ++i) {
+ const char* entry_point = iree_hal_metal_builtin_executable_entry_points[i].entry_point;
+ uint32_t file_index = iree_hal_metal_builtin_executable_entry_points[i].file_index;
+ const char* source_data = metal_buffer_kernels_create()[file_index].data;
id<MTLLibrary> library = nil;
id<MTLFunction> function = nil;
id<MTLComputePipelineState> pso = nil;
- status = iree_hal_metal_compile_msl(fill_buffer_source_data, entry_point, device,
- compile_options, &library, &function, &pso);
+ status = iree_hal_metal_compile_msl(source_data, entry_point, device, compile_options,
+ &library, &function, &pso);
if (!iree_status_is_ok(status)) break;
// Package required parameters for kernel launches for each entry point.
@@ -166,3 +182,37 @@
threadsPerThreadgroup:MTLSizeMake(workgroup_size, 1, 1)];
return iree_ok_status();
}
+
+iree_status_t iree_hal_metal_builtin_executable_copy_buffer(
+ const iree_hal_metal_builtin_executable_t* executable, id<MTLComputeCommandEncoder> encoder,
+ id<MTLBuffer> source_buffer, iree_device_size_t source_offset, id<MTLBuffer> target_buffer,
+ iree_device_size_t target_offset, iree_device_size_t length) {
+ id<MTLComputePipelineState> pso = executable->entry_points[3].pso;
+ const iree_device_size_t workgroup_size = 32;
+ iree_device_size_t workgroup_count = iree_hal_metal_ceil_div(length, workgroup_size * 4);
+
+ iree_hal_metal_buffer_copy_spec_t spec = {
+ .src_buffer_offset = source_offset,
+ .dst_buffer_offset = target_offset,
+ .length = length,
+ };
+
+ [encoder setComputePipelineState:pso];
+
+ // The following MUST exactly match the pipeline layout from MSL source code.
+ // buffer(0) is the source buffer. Note that we MUST set 0 as offset here--the offset is to be
+ // handled directly in the kernels!
+ [encoder setBuffer:source_buffer offset:0 atIndex:0];
+ [encoder useResource:source_buffer usage:MTLResourceUsageRead];
+ // buffer(0) is the target buffer. Note that we MUST set 0 as offset here--the offset is to be
+ // handled directly in the kernels!
+ [encoder setBuffer:target_buffer offset:0 atIndex:1];
+ [encoder useResource:target_buffer usage:MTLResourceUsageWrite];
+ // buffer(1) is the buffer copy spec.
+ [encoder setBytes:&spec length:sizeof(spec) atIndex:2];
+
+ // Encode the dispatch.
+ [encoder dispatchThreadgroups:MTLSizeMake(workgroup_count, 1, 1)
+ threadsPerThreadgroup:MTLSizeMake(workgroup_size, 1, 1)];
+ return iree_ok_status();
+}
diff --git a/experimental/metal/direct_command_buffer.m b/experimental/metal/direct_command_buffer.m
index 7f9d81b..1ab3583 100644
--- a/experimental/metal/direct_command_buffer.m
+++ b/experimental/metal/direct_command_buffer.m
@@ -456,6 +456,37 @@
return status;
}
+static iree_status_t iree_hal_metal_command_buffer_copy_buffer_internal(
+ iree_hal_metal_command_buffer_t* command_buffer, id<MTLBuffer> source_device_buffer,
+ iree_device_size_t source_offset, id<MTLBuffer> target_device_buffer,
+ iree_device_size_t target_offset, iree_device_size_t length) {
+ // Per the spec for copyFromBuffer:sourceOffset:toBuffer:destinationOffset:size, the source/target
+ // offset and length must be a multiple of 4 bytes in macOS, and 1 byte in iOS and tvOS.
+#if defined(IREE_PLATFORM_MACOS)
+ bool can_use_metal_api = source_offset % 4 == 0 && target_offset % 4 == 0 && length % 4 == 0;
+#else
+ bool can_use_metal_api = true;
+#endif
+
+ iree_status_t status = iree_ok_status();
+ if (can_use_metal_api) {
+ id<MTLBlitCommandEncoder> encoder = iree_hal_metal_get_or_begin_blit_encoder(command_buffer);
+ [encoder copyFromBuffer:source_device_buffer
+ sourceOffset:source_offset
+ toBuffer:target_device_buffer
+ destinationOffset:target_offset
+ size:length];
+ } else {
+ id<MTLComputeCommandEncoder> encoder =
+ iree_hal_metal_get_or_begin_compute_encoder(command_buffer);
+ status = iree_hal_metal_builtin_executable_copy_buffer(
+ command_buffer->builtin_executable, encoder, source_device_buffer, source_offset,
+ target_device_buffer, target_offset, length);
+ }
+
+ return status;
+}
+
static iree_status_t iree_hal_metal_command_buffer_update_buffer(
iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
@@ -466,16 +497,6 @@
iree_hal_metal_command_buffer_cast(base_command_buffer);
IREE_TRACE_ZONE_BEGIN(z0);
- // Per the spec for copyFromBuffer:sourceOffset:toBuffer:destinationOffset:size, the source/target
- // offset and length must be a multiple of 4 bytes in macOS, and 1 byte in iOS and tvOS.
-#if defined(IREE_PLATFORM_MACOS)
- if (source_offset % 4 != 0 || target_offset % 4 != 0 || length % 4 != 0) {
- IREE_TRACE_ZONE_END(z0);
- return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
- "unimplemented buffer update with non-4-multiple offset/length");
- }
-#endif
-
id<MTLDevice> device = command_buffer->command_buffer.device;
MTLResourceOptions options = MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined;
id<MTLBuffer> data_buffer = [device newBufferWithBytes:((uint8_t*)source_buffer + source_offset)
@@ -492,15 +513,12 @@
iree_hal_metal_buffer_handle(iree_hal_buffer_allocated_buffer(target_buffer));
target_offset += iree_hal_buffer_byte_offset(target_buffer);
- id<MTLBlitCommandEncoder> encoder = iree_hal_metal_get_or_begin_blit_encoder(command_buffer);
- [encoder copyFromBuffer:data_buffer
- sourceOffset:0
- toBuffer:target_device_buffer
- destinationOffset:target_offset
- size:length];
+ iree_status_t status = iree_hal_metal_command_buffer_copy_buffer_internal(
+ command_buffer, data_buffer, /*source_offset=*/0, target_device_buffer, target_offset,
+ length);
IREE_TRACE_ZONE_END(z0);
- return iree_ok_status();
+ return status;
}
static iree_status_t iree_hal_metal_command_buffer_copy_buffer(
@@ -511,16 +529,6 @@
iree_hal_metal_command_buffer_cast(base_command_buffer);
IREE_TRACE_ZONE_BEGIN(z0);
- // Per the spec for copyFromBuffer:sourceOffset:toBuffer:destinationOffset:size, the source/target
- // offset and length must be a multiple of 4 bytes in macOS, and 1 byte in iOS and tvOS.
-#if defined(IREE_PLATFORM_MACOS)
- if (source_offset % 4 != 0 || target_offset % 4 != 0 || length % 4 != 0) {
- IREE_TRACE_ZONE_END(z0);
- return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
- "unimplemented copy buffer with non-4-multiple offset/length");
- }
-#endif
-
const iree_hal_buffer_t* buffers[2] = {source_buffer, target_buffer};
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_resource_set_insert(command_buffer->resource_set, 2, buffers));
@@ -533,14 +541,12 @@
source_offset += iree_hal_buffer_byte_offset(source_buffer);
target_offset += iree_hal_buffer_byte_offset(target_buffer);
- id<MTLBlitCommandEncoder> encoder = iree_hal_metal_get_or_begin_blit_encoder(command_buffer);
- [encoder copyFromBuffer:source_device_buffer
- sourceOffset:source_offset
- toBuffer:target_device_buffer
- destinationOffset:target_offset
- size:length];
+ iree_status_t status = iree_hal_metal_command_buffer_copy_buffer_internal(
+ command_buffer, source_device_buffer, source_offset, target_device_buffer, target_offset,
+ length);
- return iree_ok_status();
+ IREE_TRACE_ZONE_END(z0);
+ return status;
}
static iree_status_t iree_hal_metal_command_buffer_collective(