[metal] Retain device until command buffer completion (#15288)
We use resource sets in `iree_hal_metal_device_queue_execute` to keep
track of command buffers and semaphores, and release them when the
command buffers complete. The block pool backing those resource sets
lives inside the device, so it must outlive them; otherwise freeing a
resource set would access destroyed data structures. We therefore need
to retain the device until the command buffer completes.
Fixes https://github.com/openxla/iree/issues/14814
diff --git a/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m b/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m
index d3b52c6..3951c48 100644
--- a/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m
+++ b/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m
@@ -398,7 +398,7 @@
iree_hal_metal_staging_buffer_increase_command_buffer_refcount(staging_buffer);
// Retain the device given that we refer to builtin executables and staging buffers whose
// lifetime is associated with the device.
- iree_hal_resource_retain(device);
+ iree_hal_device_retain(device);
} else {
iree_hal_metal_command_buffer_destroy_internal(&command_buffer->base);
}
@@ -437,7 +437,7 @@
iree_hal_metal_command_buffer_destroy_internal(base_command_buffer);
- iree_hal_resource_release(device);
+ iree_hal_device_release(device);
IREE_TRACE_ZONE_END(z0);
}
diff --git a/runtime/src/iree/hal/drivers/metal/metal_device.m b/runtime/src/iree/hal/drivers/metal/metal_device.m
index 6cc23ef..8199017 100644
--- a/runtime/src/iree/hal/drivers/metal/metal_device.m
+++ b/runtime/src/iree/hal/drivers/metal/metal_device.m
@@ -446,9 +446,16 @@
value:signal_semaphore_list.payload_values[i]];
}
+ // We use a resource set to keep track of the resources above, so here we need to retain
+ // the device to make sure the block pool behind it outlives the resource set.
+ iree_hal_device_retain(base_device);
[signal_command_buffer addCompletedHandler:^(id<MTLCommandBuffer> cb) {
// Now we can release all retained resources.
iree_hal_resource_set_free(resource_set);
+ // And then release the device handle. Note that this must happen separately--if we put the
+ // device itself in the resource set, we could destroy the block pool data structure inside
+ // the device prematurely, before the resource set free procedure is done scanning it.
+ iree_hal_device_release(base_device);
}];
[signal_command_buffer commit];
}