[metal] Keep track of queue in buffer construction for macOS

For the managed storage mode, we need to issue GPU commands to
explicitly make buffer contents visible to the CPU. This requires
a handle to an MTLCommandQueue. Instead of creating a new queue
internally, pass the device's queue to the allocator, which hands
it to each buffer at construction time.
diff --git a/experimental/metal/direct_allocator.h b/experimental/metal/direct_allocator.h
index 5e3621f..ae39a18 100644
--- a/experimental/metal/direct_allocator.h
+++ b/experimental/metal/direct_allocator.h
@@ -17,19 +17,28 @@
 extern "C" {
 #endif  // __cplusplus
 
-// Create a straightforward Metal allocator from the given |base_device| that
+// Creates a straightforward Metal allocator from the given |device| that
 // performs allocations separately without caching or suballocation.
 //
+// On macOS, we additionally need the command queue to encode commands to make
+// buffer contents visible to the CPU when using the managed storage mode.
+//
 // |out_allocator| must be released by the caller (see
 // iree_hal_allocator_release).
 iree_status_t iree_hal_metal_allocator_create(
-    iree_hal_device_t* base_device, id<MTLDevice> device,
+    id<MTLDevice> device,
+#if defined(IREE_PLATFORM_MACOS)
+    id<MTLCommandQueue> queue,
+#endif  // IREE_PLATFORM_MACOS
     iree_hal_metal_resource_hazard_tracking_mode_t resource_tracking_mode,
     iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator);
 
-// Returns the underyling HAL device associated with the given |allocator|.
-const iree_hal_device_t* iree_hal_metal_allocator_device(
+#if defined(IREE_PLATFORM_MACOS)
+// Returns the underlying MTLCommandQueue associated with the given
+// |allocator|.
+id<MTLCommandQueue> iree_hal_metal_allocator_command_queue(
     const iree_hal_allocator_t* allocator);
+#endif  // IREE_PLATFORM_MACOS
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/experimental/metal/direct_allocator.m b/experimental/metal/direct_allocator.m
index 84b6137..9c58a4d 100644
--- a/experimental/metal/direct_allocator.m
+++ b/experimental/metal/direct_allocator.m
@@ -23,8 +23,11 @@
   iree_hal_resource_t resource;
 
   // The device that this allocator is attached to.
-  iree_hal_device_t* base_device;
   id<MTLDevice> device;
+  // The command queue that we can use to issue commands to make buffer contents visible to CPU.
+#if defined(IREE_PLATFORM_MACOS)
+  id<MTLCommandQueue> queue;
+#endif  // IREE_PLATFORM_MACOS
 
   bool is_unified_memory;
   iree_hal_metal_resource_hazard_tracking_mode_t resource_tracking_mode;
@@ -48,10 +51,12 @@
 }
 
 iree_status_t iree_hal_metal_allocator_create(
-    iree_hal_device_t* base_device, id<MTLDevice> device,
+    id<MTLDevice> device,
+#if defined(IREE_PLATFORM_MACOS)
+    id<MTLCommandQueue> queue,
+#endif  // IREE_PLATFORM_MACOS
     iree_hal_metal_resource_hazard_tracking_mode_t resource_tracking_mode,
     iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator) {
-  IREE_ASSERT_ARGUMENT(base_device);
   IREE_ASSERT_ARGUMENT(out_allocator);
   IREE_TRACE_ZONE_BEGIN(z0);
 
@@ -61,8 +66,10 @@
 
   if (iree_status_is_ok(status)) {
     iree_hal_resource_initialize(&iree_hal_metal_allocator_vtable, &allocator->resource);
-    allocator->base_device = base_device;
-    iree_hal_device_retain(base_device);
     allocator->device = [device retain];  // +1
+#if defined(IREE_PLATFORM_MACOS)
+    // Keep the queue alive for as long as this allocator hands it to buffers.
+    allocator->queue = [queue retain];  // +1
+#endif  // IREE_PLATFORM_MACOS
     allocator->is_unified_memory = [device hasUnifiedMemory];
     allocator->resource_tracking_mode = resource_tracking_mode;
@@ -81,7 +88,9 @@
   IREE_TRACE_ZONE_BEGIN(z0);
 
+#if defined(IREE_PLATFORM_MACOS)
+  [allocator->queue release];  // -1
+#endif  // IREE_PLATFORM_MACOS
   [allocator->device release];  // -1
-  iree_hal_device_release(allocator->base_device);
   iree_allocator_free(host_allocator, allocator);
 
   IREE_TRACE_ZONE_END(z0);
@@ -93,11 +102,13 @@
   return allocator->host_allocator;
 }
 
-const iree_hal_device_t* iree_hal_metal_allocator_device(
+#if defined(IREE_PLATFORM_MACOS)
+id<MTLCommandQueue> iree_hal_metal_allocator_command_queue(
     const iree_hal_allocator_t* base_allocator) {
   const iree_hal_metal_allocator_t* allocator = (const iree_hal_metal_allocator_t*)base_allocator;
-  return allocator->base_device;
+  return allocator->queue;
 }
+#endif  // IREE_PLATFORM_MACOS
 
 static iree_status_t iree_hal_metal_allocator_trim(
     iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
@@ -285,6 +296,9 @@
   iree_hal_buffer_t* buffer = NULL;
   if (iree_status_is_ok(status)) {
     status = iree_hal_metal_buffer_wrap(
+#if defined(IREE_PLATFORM_MACOS)
+        allocator->queue,
+#endif  // IREE_PLATFORM_MACOS
         metal_buffer, base_allocator, compat_params.type, compat_params.access, compat_params.usage,
         allocation_size, /*byte_offset=*/0,
         /*byte_length=*/allocation_size, iree_hal_buffer_release_callback_null(), &buffer);  // +1
diff --git a/experimental/metal/metal_buffer.h b/experimental/metal/metal_buffer.h
index 01145c4..b7f0b65 100644
--- a/experimental/metal/metal_buffer.h
+++ b/experimental/metal/metal_buffer.h
@@ -20,6 +20,9 @@
 //
 // |out_buffer| must be released by the caller (see iree_hal_buffer_release).
 iree_status_t iree_hal_metal_buffer_wrap(
+#if defined(IREE_PLATFORM_MACOS)
+    id<MTLCommandQueue> queue,
+#endif  // IREE_PLATFORM_MACOS
     id<MTLBuffer> metal_buffer, iree_hal_allocator_t* allocator,
     iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access,
     iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
diff --git a/experimental/metal/metal_buffer.m b/experimental/metal/metal_buffer.m
index 8e927a1..5fa909e 100644
--- a/experimental/metal/metal_buffer.m
+++ b/experimental/metal/metal_buffer.m
@@ -18,6 +18,10 @@
 typedef struct iree_hal_metal_buffer_t {
   iree_hal_buffer_t base;
   id<MTLBuffer> buffer;
+  // The command queue that we can use to issue commands to make buffer contents visible to CPU.
+#if defined(IREE_PLATFORM_MACOS)
+  id<MTLCommandQueue> queue;
+#endif  // IREE_PLATFORM_MACOS
   iree_hal_buffer_release_callback_t release_callback;
 } iree_hal_metal_buffer_t;
 
@@ -35,6 +39,9 @@
 }
 
 iree_status_t iree_hal_metal_buffer_wrap(
+#if defined(IREE_PLATFORM_MACOS)
+    id<MTLCommandQueue> queue,
+#endif  // IREE_PLATFORM_MACOS
     id<MTLBuffer> metal_buffer, iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
     iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage,
     iree_device_size_t allocation_size, iree_device_size_t byte_offset,
@@ -52,6 +59,9 @@
                                byte_offset, byte_length, memory_type, allowed_access, allowed_usage,
                                &iree_hal_metal_buffer_vtable, &buffer->base);
     buffer->buffer = [metal_buffer retain];  // +1
+#if defined(IREE_PLATFORM_MACOS)
+    buffer->queue = queue;
+#endif  // IREE_PLATFORM_MACOS
     buffer->release_callback = release_callback;
     *out_buffer = &buffer->base;
   }
@@ -90,10 +100,7 @@
   // and commit to the queue.
   iree_hal_metal_buffer_t* buffer = iree_hal_metal_buffer_cast(base_buffer);
   if (buffer->buffer.storageMode == MTLStorageModeManaged) {
-    const iree_hal_device_t* device =
-        iree_hal_metal_allocator_device(buffer->base.device_allocator);
-    id<MTLCommandQueue> queue = iree_hal_metal_device_command_queue(device);
-    id<MTLCommandBuffer> command_buffer = [queue commandBuffer];
+    id<MTLCommandBuffer> command_buffer = [buffer->queue commandBuffer];
 
     id<MTLBlitCommandEncoder> blitCommandEncoder = [command_buffer blitCommandEncoder];
     [blitCommandEncoder synchronizeResource:buffer->buffer];
diff --git a/experimental/metal/metal_device.h b/experimental/metal/metal_device.h
index 656fd09..3348913 100644
--- a/experimental/metal/metal_device.h
+++ b/experimental/metal/metal_device.h
@@ -30,11 +30,6 @@
 const iree_hal_metal_device_params_t* iree_hal_metal_device_params(
     const iree_hal_device_t* device);
 
-// Returns the Metal command queue associated with the given |device|.
-// Note that right now we only support one command queue per Metal device.
-id<MTLCommandQueue> iree_hal_metal_device_command_queue(
-    const iree_hal_device_t* device);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/experimental/metal/metal_device.m b/experimental/metal/metal_device.m
index bcbeb37..191bd25 100644
--- a/experimental/metal/metal_device.m
+++ b/experimental/metal/metal_device.m
@@ -86,11 +86,6 @@
   return &device->params;
 }
 
-id<MTLCommandQueue> iree_hal_metal_device_command_queue(const iree_hal_device_t* base_device) {
-  const iree_hal_metal_device_t* device = iree_hal_metal_device_const_cast(base_device);
-  return device->queue;
-}
-
 static iree_status_t iree_hal_metal_device_create_internal(
     iree_string_view_t identifier, const iree_hal_metal_device_params_t* params,
     id<MTLDevice> metal_device, iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
@@ -99,7 +94,12 @@
   iree_host_size_t total_size = iree_sizeof_struct(*device) + identifier.size;
   IREE_RETURN_IF_ERROR(iree_allocator_malloc(host_allocator, total_size, (void**)&device));
 
-  iree_status_t status = iree_hal_metal_allocator_create((iree_hal_device_t*)device, metal_device,
+  id<MTLCommandQueue> metal_queue = [metal_device newCommandQueue];  // +1
+
+  iree_status_t status = iree_hal_metal_allocator_create(metal_device,
+#if defined(IREE_PLATFORM_MACOS)
+                                                         metal_queue,
+#endif  // IREE_PLATFORM_MACOS
                                                          params->resource_hazard_tracking_mode,
                                                          host_allocator, &device->device_allocator);
 
@@ -125,8 +125,8 @@
     iree_arena_block_pool_initialize(params->arena_block_size, host_allocator, &device->block_pool);
     device->params = *params;
     device->host_allocator = host_allocator;
-    device->device = [metal_device retain];          // +1
-    device->queue = [metal_device newCommandQueue];  // +1
+    device->device = [metal_device retain];  // +1
+    device->queue = metal_queue;
     device->command_buffer_resource_reference_mode = params->command_buffer_resource_reference_mode;
     device->builtin_executable = builtin_executable;
     dispatch_queue_attr_t queue_attr = dispatch_queue_attr_make_with_qos_class(