[metal] Add builtin executable for copy unaligned buffers

For buffer copy on macOS, Metal require source/destination offset
and length to be some multiple of 4. So for unaligned cases, we
need a builtin executable to do the copy by ourselves.

This commit adds such a kernel. It copies 1 byte per thread, which
should work for Metal 3 devices. The performance won't likely be
good though; however, this is just a fallback and we can optimize
cases like fill buffer builtin executables if necessary.
diff --git a/experimental/metal/builtin/CMakeLists.txt b/experimental/metal/builtin/CMakeLists.txt
index e41cc83..5c0c0e1 100644
--- a/experimental/metal/builtin/CMakeLists.txt
+++ b/experimental/metal/builtin/CMakeLists.txt
@@ -10,13 +10,14 @@
   NAME
     builtin
   SRCS
+    "copy_buffer_generic.metal"
     "fill_buffer_generic.metal"
   C_FILE_OUTPUT
-    "fill_buffer_generic.c"
+    "metal_buffer_kernels.c"
   H_FILE_OUTPUT
-    "fill_buffer_generic.h"
+    "metal_buffer_kernels.h"
   IDENTIFIER
-    "fill_buffer_generic"
+    "metal_buffer_kernels"
   FLATTEN
   PUBLIC
 )
diff --git a/experimental/metal/builtin/copy_buffer_generic.metal b/experimental/metal/builtin/copy_buffer_generic.metal
new file mode 100644
index 0000000..ea66a5d
--- /dev/null
+++ b/experimental/metal/builtin/copy_buffer_generic.metal
@@ -0,0 +1,27 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Polyfill Metal kernels for buffer copies without 4-byte aligned offsets / lengths.
+
+struct CopySpec {
+  uint64_t src_buffer_offset;  // Source buffer offset (in bytes)
+  uint64_t dst_buffer_offset;  // Destination buffer offset (in bytes)
+  uint64_t length;             // Buffer length to fill (in bytes)
+};
+
+// Copies data from |src_buffer| to |dst_buffer| with the given |spec|ification.
+//
+// No alignment requirement on source/destination buffer offset and length.
+// Each thread copies over one byte. This won't be good for perf; so just a fallback.
+kernel void copy_buffer_1byte(
+  device uint8_t *src_buffer [[buffer(0)]],
+  device uint8_t *dst_buffer [[buffer(1)]],
+  constant CopySpec &spec [[buffer(2)]],
+  uint id [[thread_position_in_grid]]
+) {
+  if (id >= spec.length) return;
+  dst_buffer[spec.dst_buffer_offset + id] = src_buffer[spec.src_buffer_offset + id];
+}
diff --git a/experimental/metal/builtin_executables.h b/experimental/metal/builtin_executables.h
index 5e26325..5c9c07e 100644
--- a/experimental/metal/builtin_executables.h
+++ b/experimental/metal/builtin_executables.h
@@ -47,6 +47,18 @@
     iree_device_size_t target_offset, iree_device_size_t length,
     uint32_t pattern);
 
+// Copies the |source_buffer| at |source_offset| to the |target_buffer| at
+// |target_offset| of |length| using builtin executables dispatched via
+// |encoder|.
+//
+// Under the hood, this will record all necessary commands to bind kernel
+// objects and buffer resources, and the perform dispatch.
+iree_status_t iree_hal_metal_builtin_executable_copy_buffer(
+    const iree_hal_metal_builtin_executable_t* executable,
+    id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> source_buffer,
+    iree_device_size_t source_offset, id<MTLBuffer> target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/experimental/metal/builtin_executables.m b/experimental/metal/builtin_executables.m
index 1cc8000..287538e 100644
--- a/experimental/metal/builtin_executables.m
+++ b/experimental/metal/builtin_executables.m
@@ -7,17 +7,25 @@
 #include "experimental/metal/builtin_executables.h"
 
 #include <string.h>
-#include "experimental/metal/builtin/fill_buffer_generic.h"
+
+#include "experimental/metal/builtin/metal_buffer_kernels.h"
 #include "iree/base/api.h"
 #include "iree/base/tracing.h"
 #include "iree/hal/api.h"
 
-// The list of builtin executable entry points. This MUST be consistent with kernel function names
-// in MSL source code.
-static const char* iree_hal_metal_builtin_executable_entry_points[] = {
-    "fill_buffer_16byte",  // Buffer fills; 16-byte aligned offset/length
-    "fill_buffer_4byte",   // Buffer fills; 4-byte aligned offset/length
-    "fill_buffer_1byte",   // Buffer fills; 1-byte aligned offset/length
+typedef struct iree_hal_metal_builtin_executable_data_t {
+  const char* entry_point;
+  uint32_t file_index;
+} iree_hal_metal_builtin_executable_data_t;
+
+// The list of builtin executable entry points and their source file index in builtin exectuable
+// embedded data. This MUST be consistent with kernel function names in MSL source code and the file
+// order in embedded data.
+static iree_hal_metal_builtin_executable_data_t iree_hal_metal_builtin_executable_entry_points[] = {
+    {"fill_buffer_16byte", 1},  // Buffer fills; 16-byte aligned offset/length
+    {"fill_buffer_4byte", 1},   // Buffer fills; 4-byte aligned offset/length
+    {"fill_buffer_1byte", 1},   // Buffer fills; 1-byte aligned offset/length
+    {"copy_buffer_1byte", 0},   // Buffer copies; 1-byte aligned offset/length
 };
 
 // The buffer fill specificiation. This MUST be consistent with the same struct in MSL source code.
@@ -27,6 +35,12 @@
   uint32_t pattern;        // 32-bit fill pattern
 } iree_hal_metal_buffer_fill_spec_t;
 
+typedef struct iree_hal_metal_buffer_copy_spec_t {
+  uint64_t src_buffer_offset;  // Source buffer offset (in bytes)
+  uint64_t dst_buffer_offset;  // Destination buffer offset (in bytes)
+  uint64_t length;             // Buffer length to fill (in bytes)
+} iree_hal_metal_buffer_copy_spec_t;
+
 iree_status_t iree_hal_metal_builtin_executable_create(
     id<MTLDevice> device, iree_allocator_t host_allocator,
     iree_hal_metal_builtin_executable_t** out_executable) {
@@ -54,15 +68,17 @@
     MTLCompileOptions* compile_options = [MTLCompileOptions new];  // +1
     compile_options.languageVersion = MTLLanguageVersion3_0;
 
-    const char* fill_buffer_source_data = fill_buffer_generic_create()[0].data;
-    for (unsigned i = 0; i < IREE_ARRAYSIZE(iree_hal_metal_builtin_executable_entry_points); ++i) {
-      const char* entry_point = iree_hal_metal_builtin_executable_entry_points[i];
+    for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(iree_hal_metal_builtin_executable_entry_points);
+         ++i) {
+      const char* entry_point = iree_hal_metal_builtin_executable_entry_points[i].entry_point;
+      uint32_t file_index = iree_hal_metal_builtin_executable_entry_points[i].file_index;
+      const char* source_data = metal_buffer_kernels_create()[file_index].data;
 
       id<MTLLibrary> library = nil;
       id<MTLFunction> function = nil;
       id<MTLComputePipelineState> pso = nil;
-      status = iree_hal_metal_compile_msl(fill_buffer_source_data, entry_point, device,
-                                          compile_options, &library, &function, &pso);
+      status = iree_hal_metal_compile_msl(source_data, entry_point, device, compile_options,
+                                          &library, &function, &pso);
       if (!iree_status_is_ok(status)) break;
 
       // Package required parameters for kernel launches for each entry point.
@@ -166,3 +182,37 @@
           threadsPerThreadgroup:MTLSizeMake(workgroup_size, 1, 1)];
   return iree_ok_status();
 }
+
+iree_status_t iree_hal_metal_builtin_executable_copy_buffer(
+    const iree_hal_metal_builtin_executable_t* executable, id<MTLComputeCommandEncoder> encoder,
+    id<MTLBuffer> source_buffer, iree_device_size_t source_offset, id<MTLBuffer> target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length) {
+  id<MTLComputePipelineState> pso = executable->entry_points[3].pso;
+  const iree_device_size_t workgroup_size = 32;
+  iree_device_size_t workgroup_count = iree_hal_metal_ceil_div(length, workgroup_size * 4);
+
+  iree_hal_metal_buffer_copy_spec_t spec = {
+      .src_buffer_offset = source_offset,
+      .dst_buffer_offset = target_offset,
+      .length = length,
+  };
+
+  [encoder setComputePipelineState:pso];
+
+  // The following MUST exactly match the pipeline layout from MSL source code.
+  // buffer(0) is the source buffer. Note that we MUST set 0 as offset here--the offset is to be
+  // handled directly in the kernels!
+  [encoder setBuffer:source_buffer offset:0 atIndex:0];
+  [encoder useResource:source_buffer usage:MTLResourceUsageRead];
+  // buffer(0) is the target buffer. Note that we MUST set 0 as offset here--the offset is to be
+  // handled directly in the kernels!
+  [encoder setBuffer:target_buffer offset:0 atIndex:1];
+  [encoder useResource:target_buffer usage:MTLResourceUsageWrite];
+  // buffer(1) is the buffer copy spec.
+  [encoder setBytes:&spec length:sizeof(spec) atIndex:2];
+
+  // Encode the dispatch.
+  [encoder dispatchThreadgroups:MTLSizeMake(workgroup_count, 1, 1)
+          threadsPerThreadgroup:MTLSizeMake(workgroup_size, 1, 1)];
+  return iree_ok_status();
+}
diff --git a/experimental/metal/direct_command_buffer.m b/experimental/metal/direct_command_buffer.m
index 7f9d81b..1ab3583 100644
--- a/experimental/metal/direct_command_buffer.m
+++ b/experimental/metal/direct_command_buffer.m
@@ -456,6 +456,37 @@
   return status;
 }
 
+static iree_status_t iree_hal_metal_command_buffer_copy_buffer_internal(
+    iree_hal_metal_command_buffer_t* command_buffer, id<MTLBuffer> source_device_buffer,
+    iree_device_size_t source_offset, id<MTLBuffer> target_device_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length) {
+  // Per the spec for copyFromBuffer:sourceOffset:toBuffer:destinationOffset:size, the source/target
+  // offset and length must be a multiple of 4 bytes in macOS, and 1 byte in iOS and tvOS.
+#if defined(IREE_PLATFORM_MACOS)
+  bool can_use_metal_api = source_offset % 4 == 0 && target_offset % 4 == 0 && length % 4 == 0;
+#else
+  bool can_use_metal_api = true;
+#endif
+
+  iree_status_t status = iree_ok_status();
+  if (can_use_metal_api) {
+    id<MTLBlitCommandEncoder> encoder = iree_hal_metal_get_or_begin_blit_encoder(command_buffer);
+    [encoder copyFromBuffer:source_device_buffer
+               sourceOffset:source_offset
+                   toBuffer:target_device_buffer
+          destinationOffset:target_offset
+                       size:length];
+  } else {
+    id<MTLComputeCommandEncoder> encoder =
+        iree_hal_metal_get_or_begin_compute_encoder(command_buffer);
+    status = iree_hal_metal_builtin_executable_copy_buffer(
+        command_buffer->builtin_executable, encoder, source_device_buffer, source_offset,
+        target_device_buffer, target_offset, length);
+  }
+
+  return status;
+}
+
 static iree_status_t iree_hal_metal_command_buffer_update_buffer(
     iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
     iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
@@ -466,16 +497,6 @@
       iree_hal_metal_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
 
-  // Per the spec for copyFromBuffer:sourceOffset:toBuffer:destinationOffset:size, the source/target
-  // offset and length must be a multiple of 4 bytes in macOS, and 1 byte in iOS and tvOS.
-#if defined(IREE_PLATFORM_MACOS)
-  if (source_offset % 4 != 0 || target_offset % 4 != 0 || length % 4 != 0) {
-    IREE_TRACE_ZONE_END(z0);
-    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
-                            "unimplemented buffer update with non-4-multiple offset/length");
-  }
-#endif
-
   id<MTLDevice> device = command_buffer->command_buffer.device;
   MTLResourceOptions options = MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined;
   id<MTLBuffer> data_buffer = [device newBufferWithBytes:((uint8_t*)source_buffer + source_offset)
@@ -492,15 +513,12 @@
       iree_hal_metal_buffer_handle(iree_hal_buffer_allocated_buffer(target_buffer));
   target_offset += iree_hal_buffer_byte_offset(target_buffer);
 
-  id<MTLBlitCommandEncoder> encoder = iree_hal_metal_get_or_begin_blit_encoder(command_buffer);
-  [encoder copyFromBuffer:data_buffer
-             sourceOffset:0
-                 toBuffer:target_device_buffer
-        destinationOffset:target_offset
-                     size:length];
+  iree_status_t status = iree_hal_metal_command_buffer_copy_buffer_internal(
+      command_buffer, data_buffer, /*source_offset=*/0, target_device_buffer, target_offset,
+      length);
 
   IREE_TRACE_ZONE_END(z0);
-  return iree_ok_status();
+  return status;
 }
 
 static iree_status_t iree_hal_metal_command_buffer_copy_buffer(
@@ -511,16 +529,6 @@
       iree_hal_metal_command_buffer_cast(base_command_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
 
-  // Per the spec for copyFromBuffer:sourceOffset:toBuffer:destinationOffset:size, the source/target
-  // offset and length must be a multiple of 4 bytes in macOS, and 1 byte in iOS and tvOS.
-#if defined(IREE_PLATFORM_MACOS)
-  if (source_offset % 4 != 0 || target_offset % 4 != 0 || length % 4 != 0) {
-    IREE_TRACE_ZONE_END(z0);
-    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
-                            "unimplemented copy buffer with non-4-multiple offset/length");
-  }
-#endif
-
   const iree_hal_buffer_t* buffers[2] = {source_buffer, target_buffer};
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_resource_set_insert(command_buffer->resource_set, 2, buffers));
@@ -533,14 +541,12 @@
   source_offset += iree_hal_buffer_byte_offset(source_buffer);
   target_offset += iree_hal_buffer_byte_offset(target_buffer);
 
-  id<MTLBlitCommandEncoder> encoder = iree_hal_metal_get_or_begin_blit_encoder(command_buffer);
-  [encoder copyFromBuffer:source_device_buffer
-             sourceOffset:source_offset
-                 toBuffer:target_device_buffer
-        destinationOffset:target_offset
-                     size:length];
+  iree_status_t status = iree_hal_metal_command_buffer_copy_buffer_internal(
+      command_buffer, source_device_buffer, source_offset, target_device_buffer, target_offset,
+      length);
 
-  return iree_ok_status();
+  IREE_TRACE_ZONE_END(z0);
+  return status;
 }
 
 static iree_status_t iree_hal_metal_command_buffer_collective(