// Copyright 2019 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/hal/drivers/vulkan/vulkan_device.h"
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>
#include "iree/base/internal/arena.h"
#include "iree/base/internal/math.h"
#include "iree/hal/drivers/vulkan/api.h"
#include "iree/hal/drivers/vulkan/builtin_executables.h"
#include "iree/hal/drivers/vulkan/command_queue.h"
#include "iree/hal/drivers/vulkan/descriptor_pool_cache.h"
#include "iree/hal/drivers/vulkan/direct_command_buffer.h"
#include "iree/hal/drivers/vulkan/direct_command_queue.h"
#include "iree/hal/drivers/vulkan/dynamic_symbols.h"
#include "iree/hal/drivers/vulkan/extensibility_util.h"
#include "iree/hal/drivers/vulkan/handle_util.h"
#include "iree/hal/drivers/vulkan/native_allocator.h"
#include "iree/hal/drivers/vulkan/native_event.h"
#include "iree/hal/drivers/vulkan/native_pipeline_layout.h"
#include "iree/hal/drivers/vulkan/native_semaphore.h"
#include "iree/hal/drivers/vulkan/nop_executable_cache.h"
#include "iree/hal/drivers/vulkan/status_util.h"
#include "iree/hal/drivers/vulkan/tracing.h"
#include "iree/hal/drivers/vulkan/util/arena.h"
#include "iree/hal/drivers/vulkan/util/ref_ptr.h"
#include "iree/hal/drivers/vulkan/vma_allocator.h"
#include "iree/hal/utils/buffer_transfer.h"
#include "iree/hal/utils/file_transfer.h"
#include "iree/hal/utils/memory_file.h"
using namespace iree::hal::vulkan;
//===----------------------------------------------------------------------===//
// RenderDoc integration
//===----------------------------------------------------------------------===//
// Configure cmake with -DIREE_ENABLE_RENDERDOC_PROFILING=ON in order to
// enable profiling support. This should be left off in production builds to
// avoid introducing a backdoor.
#if defined(IREE_HAL_VULKAN_HAVE_RENDERDOC)
#if !defined(IREE_PLATFORM_WINDOWS)
#include <dlfcn.h>
#endif // IREE_PLATFORM_WINDOWS
// NOTE: C API, see https://renderdoc.org/docs/in_application_api.html.
// When compiled in, the API will no-op itself if not running under a RenderDoc
// capture context (renderdoc.dll/so already loaded).
#include "third_party/renderdoc/renderdoc_app.h"
typedef RENDERDOC_API_1_5_0 RENDERDOC_API_LATEST;
// Returns a handle to the RenderDoc API when it is hooking the process.
// Returns NULL when RenderDoc is not present or the API query fails.
static RENDERDOC_API_LATEST* iree_hal_vulkan_query_renderdoc_api(
VkInstance instance) {
pRENDERDOC_GetAPI RENDERDOC_GetAPI = NULL;
#if defined(IREE_PLATFORM_WINDOWS)
// NOTE: RenderDoc only supports hooking so we can't use LoadLibrary - if
// we're going to use RenderDoc its library must already be loaded.
if (HMODULE hook_module = GetModuleHandleA("renderdoc.dll")) {
RENDERDOC_GetAPI =
(pRENDERDOC_GetAPI)GetProcAddress(hook_module, "RENDERDOC_GetAPI");
}
#else
// dlopen/dlsym on posix-like systems. Note that each platform has its own
// naming for the injected module. Because RenderDoc only supports hooking
// (where the hosting process loads the library in magic ways for us) we use
// RTLD_NOLOAD to ensure we don't accidentally try to load it when not hooked.
void* hook_module = NULL;
#if defined(IREE_PLATFORM_ANDROID)
hook_module = dlopen("libVkLayer_GLES_RenderDoc.so", RTLD_NOW | RTLD_NOLOAD);
#elif defined(IREE_PLATFORM_APPLE)
hook_module = dlopen("librenderdoc.dylib", RTLD_NOW | RTLD_NOLOAD);
#elif defined(IREE_PLATFORM_LINUX)
hook_module = dlopen("librenderdoc.so", RTLD_NOW | RTLD_NOLOAD);
#else
#error "RenderDoc profiling not supported on this platform"
#endif // IREE_PLATFORM_*
if (hook_module) {
RENDERDOC_GetAPI =
(pRENDERDOC_GetAPI)dlsym(hook_module, "RENDERDOC_GetAPI");
}
#endif // IREE_PLATFORM_WINDOWS
if (!RENDERDOC_GetAPI) return NULL; // not found, no-op
RENDERDOC_API_LATEST* api = NULL;
int query_result =
RENDERDOC_GetAPI(eRENDERDOC_API_Version_1_5_0, (void**)&api);
if (query_result != 1) {
// Failed to initialize API (old version, etc). No-op.
return NULL;
}
return api;
}
// Begins a new RenderDoc capture.
static void iree_hal_vulkan_begin_renderdoc_capture(
RENDERDOC_API_LATEST* renderdoc_api, VkInstance instance,
const iree_hal_device_profiling_options_t* options) {
if (!renderdoc_api) return;
if (options->file_path) {
renderdoc_api->SetCaptureFilePathTemplate(options->file_path);
}
renderdoc_api->StartFrameCapture(
RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(instance), NULL);
}
// Ends the active RenderDoc capture, if one is active.
static void iree_hal_vulkan_end_renderdoc_capture(
RENDERDOC_API_LATEST* renderdoc_api, VkInstance instance) {
if (!renderdoc_api) return;
if (renderdoc_api->IsFrameCapturing()) {
renderdoc_api->EndFrameCapture(
RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(instance), NULL);
}
}
#endif // IREE_HAL_VULKAN_HAVE_RENDERDOC
//===----------------------------------------------------------------------===//
// iree_hal_vulkan_device_t extensibility util
//===----------------------------------------------------------------------===//
IREE_API_EXPORT iree_status_t iree_hal_vulkan_query_extensibility_set(
iree_hal_vulkan_features_t requested_features,
iree_hal_vulkan_extensibility_set_t set, iree_host_size_t string_capacity,
iree_host_size_t* out_string_count, const char** out_string_values) {
*out_string_count = 0;
iree_status_t status = iree_ok_status();
iree_host_size_t string_count = 0;
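// Appends |name_literal| to the output list when querying the matching
// |target_set|. The count always advances so callers can query the required
// capacity with out_string_values == NULL; attempting to write past
// |string_capacity| produces IREE_STATUS_OUT_OF_RANGE.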
#define ADD_EXT(target_set, name_literal) \
if (iree_status_is_ok(status) && set == (target_set)) { \
if (string_count >= string_capacity && out_string_values) { \
status = iree_status_from_code(IREE_STATUS_OUT_OF_RANGE); \
} else if (out_string_values) { \
out_string_values[string_count] = (name_literal); \
} \
++string_count; \
}
//===--------------------------------------------------------------------===//
// Baseline IREE requirements
//===--------------------------------------------------------------------===//
// Using IREE at all requires these extensions unconditionally. Adding things
// here changes our minimum requirements and should be done carefully.
// Optional extensions here are feature detected by the runtime.
#if defined(IREE_PLATFORM_APPLE)
// VK_KHR_portability_subset:
// For Apple platforms, Vulkan is layered on top of Metal via MoltenVK.
// It exposes this extension to allow a non-conformant Vulkan implementation
// to be built on top of another non-Vulkan graphics API. This extension must
// be enabled if it exists.
ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_REQUIRED,
VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME);
// VK_KHR_portability_enumeration:
// Further, since devices which support the VK_KHR_portability_subset
// extension are not fully conformant Vulkan implementations, the Vulkan
// loader does not report those devices unless the application explicitly
// asks for them.
ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_REQUIRED,
VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME);
#endif // IREE_PLATFORM_APPLE
// VK_KHR_storage_buffer_storage_class:
// Our generated SPIR-V kernels use storage buffers for all their data access.
ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_REQUIRED,
VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_EXTENSION_NAME);
// VK_KHR_get_physical_device_properties2:
// Multiple extensions depend on VK_KHR_get_physical_device_properties2.
// This extension was deprecated in Vulkan 1.1 as its functionality was
// promoted to core so we list it as optional even though we require it.
ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_OPTIONAL,
VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME);
// VK_KHR_push_descriptor:
// We can avoid a lot of additional Vulkan descriptor set manipulation
// overhead when this extension is present. Android is a holdout, though, and
// we have a fallback for when it's not available.
ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME);
// VK_KHR_timeline_semaphore:
// Required as IREE's primary synchronization primitive, but the extension
// was promoted to core in Vulkan 1.2.
ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME);
// VK_KHR_external_memory:
// Promoted to core in Vulkan 1.1 and not required but here just in case
// tooling wants to see the request.
ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME);
// VK_EXT_external_memory_host:
// Optional to enable import/export of host pointers.
ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME);
// VK_KHR_buffer_device_address:
// Promoted to core in Vulkan 1.2 but still an extension in 1.1.
ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
VK_KHR_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME);
//===--------------------------------------------------------------------===//
// Vulkan forward-compatibility shims
//===--------------------------------------------------------------------===//
// These are shims or extensions that are made core later in the spec and can
// be removed once we require the core version that contains them.
// VK_LAYER_KHRONOS_timeline_semaphore:
// polyfill layer - enable if present. Ignored if timeline semaphores are
// supported natively (Vulkan 1.2+).
ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_OPTIONAL,
"VK_LAYER_KHRONOS_timeline_semaphore");
//===--------------------------------------------------------------------===//
// Optional CodeGen features
//===--------------------------------------------------------------------===//
// VK_EXT_subgroup_size_control:
// This extension allows us to control the subgroup size used by Vulkan
// implementations, which can boost performance. It was promoted to core
// in Vulkan 1.3.
ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME);
//===--------------------------------------------------------------------===//
// Optional debugging features
//===--------------------------------------------------------------------===//
// Used only when explicitly requested as they drastically change the
// performance behavior of Vulkan.
// VK_LAYER_KHRONOS_validation:
// only enabled if validation is desired. Since validation in Vulkan is just an
// API correctness check it can't be used as a security mechanism and is fine
// to ignore.
if (iree_all_bits_set(requested_features,
IREE_HAL_VULKAN_FEATURE_ENABLE_VALIDATION_LAYERS)) {
ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_OPTIONAL,
"VK_LAYER_KHRONOS_validation");
}
// VK_EXT_debug_utils:
// only enabled if debugging is desired to route Vulkan debug messages through
// our logging sinks. Note that this adds a non-trivial runtime overhead and
// we may want to disable it even in debug builds.
if (iree_all_bits_set(requested_features,
IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS)) {
ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_OPTIONAL,
VK_EXT_DEBUG_UTILS_EXTENSION_NAME);
}
#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
if (iree_all_bits_set(requested_features,
IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING)) {
// VK_EXT_host_query_reset:
// optionally allows for vkResetQueryPool to be used to reset query pools
// from the host without needing to do an expensive vkCmdResetQueryPool
// submission.
ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME);
// VK_EXT_calibrated_timestamps:
// optionally provides more accurate timestamps that correspond to the
// system time. If this is not present then tracy will attempt calibration
// itself and have some per-run variance in the skew (up to many
// milliseconds).
ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME);
}
#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
*out_string_count = string_count;
return status;
}
//===----------------------------------------------------------------------===//
// Queue selection
//===----------------------------------------------------------------------===//
#define IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX (-1)
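// Queue family indices and queue counts selected for dispatch and transfer
// usage. The dispatch and transfer families may be the same when the device
// has no dedicated transfer-only family.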
typedef struct iree_hal_vulkan_queue_family_info_t {
uint32_t dispatch_index;
iree_host_size_t dispatch_queue_count;
uint32_t transfer_index;
iree_host_size_t transfer_queue_count;
} iree_hal_vulkan_queue_family_info_t;
// Finds the first queue family in the listing (which is usually the
// driver-preferred order) that has all of the |required_queue_flags| and none
// the |excluded_queue_flags|.
// Returns IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX if no matching queue is
// found.
static uint32_t iree_hal_vulkan_find_first_queue_family_with_flags(
uint32_t queue_family_count,
const VkQueueFamilyProperties* queue_family_properties,
VkQueueFlags required_queue_flags, VkQueueFlags excluded_queue_flags) {
for (uint32_t queue_family_index = 0; queue_family_index < queue_family_count;
++queue_family_index) {
const VkQueueFamilyProperties* properties =
&queue_family_properties[queue_family_index];
if (iree_all_bits_set(properties->queueFlags, required_queue_flags) &&
!iree_any_bit_set(properties->queueFlags, excluded_queue_flags)) {
return queue_family_index;
}
}
return IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX;
}
// Selects queue family indices for compute and transfer queues.
// Note that both queue families may be the same if there is only one family
// available.
static iree_status_t iree_hal_vulkan_select_queue_families(
const iree_hal_vulkan_device_options_t* options,
VkPhysicalDevice physical_device, iree::hal::vulkan::DynamicSymbols* syms,
iree_hal_vulkan_queue_family_info_t* out_family_info) {
// Enumerate queue families available on the device.
uint32_t queue_family_count = 0;
syms->vkGetPhysicalDeviceQueueFamilyProperties(physical_device,
&queue_family_count, NULL);
VkQueueFamilyProperties* queue_family_properties =
(VkQueueFamilyProperties*)iree_alloca(queue_family_count *
sizeof(VkQueueFamilyProperties));
syms->vkGetPhysicalDeviceQueueFamilyProperties(
physical_device, &queue_family_count, queue_family_properties);
memset(out_family_info, 0, sizeof(*out_family_info));
out_family_info->dispatch_index = IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX;
out_family_info->dispatch_queue_count = 0;
out_family_info->transfer_index = IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX;
out_family_info->transfer_queue_count = 0;
// By default we choose graphics+compute as on most current GPUs this is a
// primary queue and may run at the fastest clock speed.
// If the user is integrating into applications with existing graphics
// workloads then they can request that we instead try to find a dedicated
// compute-only queue such that we can run async with the rest of their
// existing workload.
if (iree_all_bits_set(options->flags,
IREE_HAL_VULKAN_DEVICE_FLAG_DEDICATED_COMPUTE_QUEUE)) {
// Try to find a dedicated compute queue. If this fails then we'll fall back
// to any queue supporting compute.
out_family_info->dispatch_index =
iree_hal_vulkan_find_first_queue_family_with_flags(
queue_family_count, queue_family_properties, VK_QUEUE_COMPUTE_BIT,
VK_QUEUE_GRAPHICS_BIT);
}
if (out_family_info->dispatch_index ==
IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX) {
out_family_info->dispatch_index =
iree_hal_vulkan_find_first_queue_family_with_flags(
queue_family_count, queue_family_properties, VK_QUEUE_COMPUTE_BIT,
0);
}
if (out_family_info->dispatch_index ==
IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX) {
return iree_make_status(
IREE_STATUS_NOT_FOUND,
"unable to find any queue family support compute operations");
}
out_family_info->dispatch_queue_count =
queue_family_properties[out_family_info->dispatch_index].queueCount;
// Try to find a dedicated transfer queue (no compute or graphics caps).
// Not all devices have one, and some have only a queue family for
// everything and possibly a queue family just for compute/etc. If that
// fails then fallback to any queue that supports transfer. Finally, if
// /that/ fails then we just won't create a transfer queue and instead use
// the compute queue for all operations.
out_family_info->transfer_index =
iree_hal_vulkan_find_first_queue_family_with_flags(
queue_family_count, queue_family_properties, VK_QUEUE_TRANSFER_BIT,
VK_QUEUE_COMPUTE_BIT | VK_QUEUE_GRAPHICS_BIT);
if (out_family_info->transfer_index ==
IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX) {
out_family_info->transfer_index =
iree_hal_vulkan_find_first_queue_family_with_flags(
queue_family_count, queue_family_properties, VK_QUEUE_TRANSFER_BIT,
VK_QUEUE_GRAPHICS_BIT);
}
if (out_family_info->transfer_index ==
IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX) {
out_family_info->transfer_index =
iree_hal_vulkan_find_first_queue_family_with_flags(
queue_family_count, queue_family_properties, VK_QUEUE_TRANSFER_BIT,
0);
}
if (out_family_info->transfer_index !=
IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX) {
out_family_info->transfer_queue_count =
queue_family_properties[out_family_info->transfer_index].queueCount;
}
// Ensure that we don't share the dispatch queues with transfer queues if
// that would put us over the queue count.
if (out_family_info->dispatch_index == out_family_info->transfer_index) {
out_family_info->transfer_queue_count = iree_min(
queue_family_properties[out_family_info->dispatch_index].queueCount -
out_family_info->dispatch_queue_count,
out_family_info->transfer_queue_count);
}
// Limit the number of queues we create (for now).
// We may want to allow this to grow, but each queue adds overhead and we
// need to measure to make sure we can effectively use them all.
out_family_info->dispatch_queue_count =
iree_min(2u, out_family_info->dispatch_queue_count);
out_family_info->transfer_queue_count =
iree_min(1u, out_family_info->transfer_queue_count);
return iree_ok_status();
}
// Builds a set of compute and transfer queues based on the queues available on
// the device and some magic heuristical goo.
static iree_status_t iree_hal_vulkan_build_queue_sets(
const iree_hal_vulkan_device_options_t* options,
VkPhysicalDevice physical_device, iree::hal::vulkan::DynamicSymbols* syms,
iree_hal_vulkan_queue_set_t* out_compute_queue_set,
iree_hal_vulkan_queue_set_t* out_transfer_queue_set) {
// Select which queues to use (and fail if the implementation can't handle them).
iree_hal_vulkan_queue_family_info_t queue_family_info;
IREE_RETURN_IF_ERROR(iree_hal_vulkan_select_queue_families(
options, physical_device, syms, &queue_family_info));
// Build queue indices for the selected queue families.
memset(out_compute_queue_set, 0, sizeof(*out_compute_queue_set));
out_compute_queue_set->queue_family_index = queue_family_info.dispatch_index;
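  // Set one bit per dispatch queue; the bit position is the queue index within
  // the family that will later be passed to vkGetDeviceQueue.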
for (iree_host_size_t i = 0; i < queue_family_info.dispatch_queue_count;
++i) {
out_compute_queue_set->queue_indices |= 1ull << i;
}
memset(out_transfer_queue_set, 0, sizeof(*out_transfer_queue_set));
out_transfer_queue_set->queue_family_index = queue_family_info.transfer_index;
uint32_t base_queue_index = 0;
if (queue_family_info.dispatch_index == queue_family_info.transfer_index) {
// Sharing a family, so transfer queues follow compute queues.
base_queue_index = queue_family_info.dispatch_index;
}
for (iree_host_size_t i = 0; i < queue_family_info.transfer_queue_count;
++i) {
out_transfer_queue_set->queue_indices |= 1ull << (i + base_queue_index);
}
return iree_ok_status();
}
//===----------------------------------------------------------------------===//
// iree_hal_vulkan_device_t
//===----------------------------------------------------------------------===//
typedef struct iree_hal_vulkan_device_t {
iree_hal_resource_t resource;
iree_string_view_t identifier;
// Optional driver that owns the instance. We retain it for our lifetime to
// ensure the instance remains valid.
iree_hal_driver_t* driver;
// Flags overriding default device behavior.
iree_hal_vulkan_device_flags_t flags;
// Which optional extensions are active and available on the device.
iree_hal_vulkan_device_extensions_t device_extensions;
VkInstance instance;
VkPhysicalDevice physical_device;
VkDeviceHandle* logical_device;
iree_allocator_t host_allocator;
iree_hal_allocator_t* device_allocator;
// Optional provider used for creating/configuring collective channels.
iree_hal_channel_provider_t* channel_provider;
// All queues available on the device; the device owns these.
iree_host_size_t queue_count;
CommandQueue** queues;
// The subset of queues that support dispatch operations. May overlap with
// transfer_queues.
iree_host_size_t dispatch_queue_count;
CommandQueue** dispatch_queues;
// The subset of queues that support transfer operations. May overlap with
// dispatch_queues.
iree_host_size_t transfer_queue_count;
CommandQueue** transfer_queues;
// |queue_count| tracing contexts, if tracing is enabled.
iree_hal_vulkan_tracing_context_t** queue_tracing_contexts;
DescriptorPoolCache* descriptor_pool_cache;
VkCommandPoolHandle* dispatch_command_pool;
VkCommandPoolHandle* transfer_command_pool;
// Block pool used for command buffers with a larger block size (as command
// buffers can contain inlined data uploads).
iree_arena_block_pool_t block_pool;
BuiltinExecutables* builtin_executables;
#if defined(IREE_HAL_VULKAN_HAVE_RENDERDOC)
RENDERDOC_API_LATEST* renderdoc_api;
#endif // IREE_HAL_VULKAN_HAVE_RENDERDOC
} iree_hal_vulkan_device_t;
namespace {
extern const iree_hal_device_vtable_t iree_hal_vulkan_device_vtable;
} // namespace
static iree_hal_vulkan_device_t* iree_hal_vulkan_device_cast(
iree_hal_device_t* base_value) {
IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_vulkan_device_vtable);
return (iree_hal_vulkan_device_t*)base_value;
}
IREE_API_EXPORT void iree_hal_vulkan_device_options_initialize(
iree_hal_vulkan_device_options_t* out_options) {
memset(out_options, 0, sizeof(*out_options));
out_options->flags = 0;
out_options->large_heap_block_size = 64 * 1024 * 1024;
}
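// Example (sketch, not prescriptive): callers typically zero-initialize the
// options via this helper before overriding specific fields, e.g.
//   iree_hal_vulkan_device_options_t options;
//   iree_hal_vulkan_device_options_initialize(&options);
//   options.flags |= IREE_HAL_VULKAN_DEVICE_FLAG_DEDICATED_COMPUTE_QUEUE;
//   // ...then pass &options to iree_hal_vulkan_device_create() or
//   // iree_hal_vulkan_wrap_device().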
// Creates a transient command pool for the given queue family.
// Command buffers allocated from the pool must only be issued on queues
// belonging to the specified family.
static iree_status_t iree_hal_vulkan_create_transient_command_pool(
VkDeviceHandle* logical_device, uint32_t queue_family_index,
VkCommandPoolHandle** out_handle) {
VkCommandPoolCreateInfo create_info;
create_info.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
create_info.pNext = NULL;
create_info.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
create_info.queueFamilyIndex = queue_family_index;
VkCommandPoolHandle* command_pool = new VkCommandPoolHandle(logical_device);
iree_status_t status = VK_RESULT_TO_STATUS(
logical_device->syms()->vkCreateCommandPool(
*logical_device, &create_info, logical_device->allocator(),
command_pool->mutable_value()),
"vkCreateCommandPool");
if (iree_status_is_ok(status)) {
*out_handle = command_pool;
} else {
delete command_pool;
}
return status;
}
// Creates a command queue of the given queue family.
static CommandQueue* iree_hal_vulkan_device_create_queue(
VkDeviceHandle* logical_device,
iree_hal_command_category_t command_category, uint32_t queue_family_index,
uint32_t queue_index) {
VkQueue queue = VK_NULL_HANDLE;
logical_device->syms()->vkGetDeviceQueue(*logical_device, queue_family_index,
queue_index, &queue);
return new DirectCommandQueue(logical_device, command_category, queue);
}
// Creates command queues for the given sets of queues and populates the
// device queue lists.
static iree_status_t iree_hal_vulkan_device_initialize_command_queues(
iree_hal_vulkan_device_t* device,
iree_hal_vulkan_features_t enabled_features,
iree_string_view_t queue_prefix,
const iree_hal_vulkan_queue_set_t* compute_queue_set,
const iree_hal_vulkan_queue_set_t* transfer_queue_set) {
device->queue_count = 0;
device->dispatch_queue_count = 0;
device->transfer_queue_count = 0;
// The first available queue supporting dispatch commands; it will be used by
// the tracing subsystem for query and cleanup tasks.
VkQueue maintenance_dispatch_queue = VK_NULL_HANDLE;
uint64_t compute_queue_count =
iree_math_count_ones_u64(compute_queue_set->queue_indices);
uint64_t transfer_queue_count =
iree_math_count_ones_u64(transfer_queue_set->queue_indices);
for (iree_host_size_t i = 0; i < compute_queue_count; ++i) {
if (!(compute_queue_set->queue_indices & (1ull << i))) continue;
char queue_name_buffer[32];
int queue_name_length =
snprintf(queue_name_buffer, IREE_ARRAYSIZE(queue_name_buffer),
"Vulkan[%c:%d]", 'D', (int)device->dispatch_queue_count);
iree_string_view_t queue_name =
iree_make_string_view(queue_name_buffer, queue_name_length);
CommandQueue* queue = iree_hal_vulkan_device_create_queue(
device->logical_device, IREE_HAL_COMMAND_CATEGORY_ANY,
compute_queue_set->queue_family_index, i);
iree_host_size_t queue_index = device->queue_count++;
device->queues[queue_index] = queue;
device->dispatch_queues[device->dispatch_queue_count++] = queue;
if (!transfer_queue_count) {
// If we don't have any dedicated transfer queues then use all dispatch
// queues as transfer queues.
device->transfer_queues[device->transfer_queue_count++] = queue;
}
if (maintenance_dispatch_queue == VK_NULL_HANDLE) {
maintenance_dispatch_queue = queue->handle();
}
if (iree_all_bits_set(enabled_features,
IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING)) {
IREE_RETURN_IF_ERROR(iree_hal_vulkan_tracing_context_allocate(
device->physical_device, device->logical_device, queue->handle(),
queue_name, maintenance_dispatch_queue, device->dispatch_command_pool,
device->host_allocator,
&device->queue_tracing_contexts[queue_index]));
queue->set_tracing_context(device->queue_tracing_contexts[queue_index]);
}
}
for (iree_host_size_t i = 0; i < transfer_queue_count; ++i) {
if (!(transfer_queue_set->queue_indices & (1ull << i))) continue;
char queue_name_buffer[32];
int queue_name_length =
snprintf(queue_name_buffer, IREE_ARRAYSIZE(queue_name_buffer),
"Vulkan[%c:%d]", 'T', (int)device->transfer_queue_count);
iree_string_view_t queue_name =
iree_make_string_view(queue_name_buffer, queue_name_length);
CommandQueue* queue = iree_hal_vulkan_device_create_queue(
device->logical_device, IREE_HAL_COMMAND_CATEGORY_TRANSFER,
transfer_queue_set->queue_family_index, i);
iree_host_size_t queue_index = device->queue_count++;
device->queues[queue_index] = queue;
device->transfer_queues[device->transfer_queue_count++] = queue;
if (iree_all_bits_set(enabled_features,
IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING)) {
IREE_RETURN_IF_ERROR(iree_hal_vulkan_tracing_context_allocate(
device->physical_device, device->logical_device, queue->handle(),
queue_name, maintenance_dispatch_queue, device->dispatch_command_pool,
device->host_allocator,
&device->queue_tracing_contexts[queue_index]));
queue->set_tracing_context(device->queue_tracing_contexts[queue_index]);
}
}
return iree_ok_status();
}
static iree_status_t iree_hal_vulkan_device_create_internal(
iree_hal_driver_t* driver, iree_string_view_t identifier,
iree_hal_vulkan_features_t enabled_features,
const iree_hal_vulkan_device_options_t* options, VkInstance instance,
VkPhysicalDevice physical_device, VkDeviceHandle* logical_device,
const iree_hal_vulkan_device_extensions_t* device_extensions,
const iree_hal_vulkan_queue_set_t* compute_queue_set,
const iree_hal_vulkan_queue_set_t* transfer_queue_set,
iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
iree_host_size_t compute_queue_count =
iree_math_count_ones_u64(compute_queue_set->queue_indices);
iree_host_size_t transfer_queue_count =
iree_math_count_ones_u64(transfer_queue_set->queue_indices);
iree_host_size_t total_queue_count =
compute_queue_count + transfer_queue_count;
iree_hal_vulkan_device_t* device = NULL;
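  // Single host allocation sized to hold the device struct, the identifier
  // string, the queue pointer arrays, and the tracing context pointers; the
  // regions are carved out of buffer_ptr below.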
iree_host_size_t total_size =
sizeof(*device) + identifier.size +
total_queue_count * sizeof(device->queues[0]) +
total_queue_count * sizeof(device->dispatch_queues[0]) +
total_queue_count * sizeof(device->transfer_queues[0]) +
total_queue_count * sizeof(device->queue_tracing_contexts[0]);
IREE_RETURN_IF_ERROR(
iree_allocator_malloc(host_allocator, total_size, (void**)&device));
memset(device, 0, total_size);
iree_hal_resource_initialize(&iree_hal_vulkan_device_vtable,
&device->resource);
device->host_allocator = host_allocator;
device->driver = driver;
iree_hal_driver_retain(device->driver);
uint8_t* buffer_ptr = (uint8_t*)device + sizeof(*device);
buffer_ptr += iree_string_view_append_to_buffer(
identifier, &device->identifier, (char*)buffer_ptr);
device->flags = options->flags;
device->device_extensions = *device_extensions;
device->instance = instance;
device->physical_device = physical_device;
device->logical_device = logical_device;
device->logical_device->AddReference();
#if defined(IREE_HAL_VULKAN_HAVE_RENDERDOC)
device->renderdoc_api = iree_hal_vulkan_query_renderdoc_api(instance);
#endif // IREE_HAL_VULKAN_HAVE_RENDERDOC
iree_arena_block_pool_initialize(32 * 1024, host_allocator,
&device->block_pool);
// Point the queue storage into the new device allocation. The queues
// themselves are populated during command queue initialization below.
device->queues = (CommandQueue**)buffer_ptr;
buffer_ptr += total_queue_count * sizeof(device->queues[0]);
device->dispatch_queues = (CommandQueue**)buffer_ptr;
buffer_ptr += total_queue_count * sizeof(device->dispatch_queues[0]);
device->transfer_queues = (CommandQueue**)buffer_ptr;
buffer_ptr += total_queue_count * sizeof(device->transfer_queues[0]);
device->queue_tracing_contexts =
(iree_hal_vulkan_tracing_context_t**)buffer_ptr;
buffer_ptr += total_queue_count * sizeof(device->queue_tracing_contexts[0]);
device->descriptor_pool_cache =
new DescriptorPoolCache(device->logical_device);
// Create the device memory allocator that will service all buffer
// allocation requests.
iree_status_t status = iree_ok_status();
if (iree_all_bits_set(options->flags,
IREE_HAL_VULKAN_DEVICE_FLAG_VMA_ALLOCATOR)) {
status = iree_hal_vulkan_vma_allocator_create(
options, instance, physical_device, logical_device,
&device->device_allocator);
} else {
status = iree_hal_vulkan_native_allocator_create(
options, instance, physical_device, logical_device,
&device->device_allocator);
}
// Create command pools for each queue family. If we don't have a transfer
// queue then we'll ignore that one and just use the dispatch pool.
// If we wanted to expose the pools through the HAL to allow the VM to more
// effectively manage them (pool per fiber, etc) we could; however, I doubt
// the overhead of locking the pool will be even a blip.
if (iree_status_is_ok(status)) {
status = iree_hal_vulkan_create_transient_command_pool(
device->logical_device, compute_queue_set->queue_family_index,
&device->dispatch_command_pool);
}
if (transfer_queue_set->queue_indices != 0 && iree_status_is_ok(status)) {
status = iree_hal_vulkan_create_transient_command_pool(
device->logical_device, transfer_queue_set->queue_family_index,
&device->transfer_command_pool);
}
// Initialize queues now that we've completed the rest of the device
// initialization; this happens last as the queues require the pools allocated
// above.
if (iree_status_is_ok(status)) {
status = iree_hal_vulkan_device_initialize_command_queues(
device, enabled_features, identifier, compute_queue_set,
transfer_queue_set);
}
if (iree_status_is_ok(status)) {
device->builtin_executables =
new BuiltinExecutables(device->logical_device);
status = device->builtin_executables->InitializeExecutables();
}
if (iree_status_is_ok(status)) {
*out_device = (iree_hal_device_t*)device;
} else {
iree_hal_device_destroy((iree_hal_device_t*)device);
}
return status;
}
static void iree_hal_vulkan_device_destroy(iree_hal_device_t* base_device) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
iree_allocator_t host_allocator = iree_hal_device_host_allocator(base_device);
IREE_TRACE_ZONE_BEGIN(z0);
// Drop all command queues. These may wait until idle in their destructor.
for (iree_host_size_t i = 0; i < device->queue_count; ++i) {
delete device->queues[i];
iree_hal_vulkan_tracing_context_free(device->queue_tracing_contexts[i]);
}
// Drop command pools now that we know there are no more outstanding command
// buffers.
delete device->dispatch_command_pool;
delete device->transfer_command_pool;
// Now that no commands are outstanding we can release all resources that may
// have been in use.
delete device->builtin_executables;
delete device->descriptor_pool_cache;
// There should be no more buffers live that use the allocator.
iree_hal_allocator_release(device->device_allocator);
// Buffers may have been retaining collective resources.
iree_hal_channel_provider_release(device->channel_provider);
// All arena blocks should have been returned.
iree_arena_block_pool_deinitialize(&device->block_pool);
// Finally, destroy the device.
device->logical_device->ReleaseReference();
iree_hal_driver_release(device->driver);
iree_allocator_free(host_allocator, device);
IREE_TRACE_ZONE_END(z0);
}
static iree_status_t iree_hal_vulkan_device_query_extensibility_set(
iree_hal_vulkan_features_t requested_features,
iree_hal_vulkan_extensibility_set_t set, iree::Arena* arena,
iree_hal_vulkan_string_list_t* out_string_list) {
IREE_RETURN_IF_ERROR(iree_hal_vulkan_query_extensibility_set(
requested_features, set, 0, &out_string_list->count, NULL));
out_string_list->values = (const char**)arena->AllocateBytes(
out_string_list->count * sizeof(out_string_list->values[0]));
IREE_RETURN_IF_ERROR(iree_hal_vulkan_query_extensibility_set(
requested_features, set, out_string_list->count, &out_string_list->count,
out_string_list->values));
return iree_ok_status();
}
iree_status_t iree_hal_vulkan_device_create(
iree_hal_driver_t* driver, iree_string_view_t identifier,
iree_hal_vulkan_features_t requested_features,
const iree_hal_vulkan_device_options_t* options,
iree_hal_vulkan_syms_t* opaque_syms, VkInstance instance,
VkPhysicalDevice physical_device, iree_allocator_t host_allocator,
iree_hal_device_t** out_device) {
DynamicSymbols* instance_syms = (DynamicSymbols*)opaque_syms;
// Find the extensions we need (or want) that are also available
// on the device. This will fail when required ones are not present.
// TODO(benvanik): replace with a real arena.
iree::Arena arena(128 * 1024);
iree_hal_vulkan_string_list_t required_extensions;
IREE_RETURN_IF_ERROR(iree_hal_vulkan_device_query_extensibility_set(
requested_features,
IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_REQUIRED, &arena,
&required_extensions));
iree_hal_vulkan_string_list_t optional_extensions;
IREE_RETURN_IF_ERROR(iree_hal_vulkan_device_query_extensibility_set(
requested_features,
IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL, &arena,
&optional_extensions));
iree_hal_vulkan_string_list_t enabled_extensions;
IREE_RETURN_IF_ERROR(iree_hal_vulkan_match_available_device_extensions(
instance_syms, physical_device, &required_extensions,
&optional_extensions, &arena, &enabled_extensions));
iree_hal_vulkan_device_extensions_t enabled_device_extensions =
iree_hal_vulkan_populate_enabled_device_extensions(&enabled_extensions);
// Find queue families we will expose as HAL queues.
iree_hal_vulkan_queue_family_info_t queue_family_info;
IREE_RETURN_IF_ERROR(iree_hal_vulkan_select_queue_families(
options, physical_device, instance_syms, &queue_family_info));
bool has_dedicated_transfer_queues =
queue_family_info.transfer_queue_count > 0;
// TODO(benvanik): convert to using the arena.
// Setup the queue info we'll be using.
// Each queue here (created from within a family) will map to a HAL queue.
//
// Note that we need to handle the case where we have transfer queues that
// are of the same queue family as the dispatch queues: Vulkan requires that
// all queues created from the same family are done in the same
// VkDeviceQueueCreateInfo struct.
std::vector<VkDeviceQueueCreateInfo> queue_create_info;
// Reserve space for create infos. Note: must be the maximum used, or else
// references used below will be invalidated as the vector grows.
queue_create_info.reserve(2);
std::vector<float> dispatch_queue_priorities;
std::vector<float> transfer_queue_priorities;
queue_create_info.push_back({});
auto& dispatch_queue_info = queue_create_info.back();
dispatch_queue_info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
dispatch_queue_info.pNext = NULL;
dispatch_queue_info.flags = 0;
dispatch_queue_info.queueFamilyIndex = queue_family_info.dispatch_index;
dispatch_queue_info.queueCount = queue_family_info.dispatch_queue_count;
if (has_dedicated_transfer_queues) {
if (queue_family_info.dispatch_index == queue_family_info.transfer_index) {
dispatch_queue_info.queueCount += queue_family_info.transfer_queue_count;
} else {
queue_create_info.push_back({});
auto& transfer_queue_info = queue_create_info.back();
transfer_queue_info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
transfer_queue_info.pNext = NULL;
transfer_queue_info.queueFamilyIndex = queue_family_info.transfer_index;
transfer_queue_info.queueCount = queue_family_info.transfer_queue_count;
transfer_queue_info.flags = 0;
transfer_queue_priorities.resize(transfer_queue_info.queueCount);
transfer_queue_info.pQueuePriorities = transfer_queue_priorities.data();
}
}
dispatch_queue_priorities.resize(dispatch_queue_info.queueCount);
dispatch_queue_info.pQueuePriorities = dispatch_queue_priorities.data();
// Collect supported physical device features.
VkPhysicalDeviceFeatures2 available_features2;
memset(&available_features2, 0, sizeof(available_features2));
available_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
VkPhysicalDeviceBufferDeviceAddressFeatures
available_buffer_device_address_features;
memset(&available_buffer_device_address_features, 0,
sizeof(available_buffer_device_address_features));
available_buffer_device_address_features.sType =
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES;
available_features2.pNext = &available_buffer_device_address_features;
instance_syms->vkGetPhysicalDeviceFeatures2(physical_device,
&available_features2);
const VkPhysicalDeviceFeatures* available_features =
&available_features2.features;
// Create device and its queues.
VkDeviceCreateInfo device_create_info;
memset(&device_create_info, 0, sizeof(device_create_info));
device_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
device_create_info.enabledLayerCount = 0;
device_create_info.ppEnabledLayerNames = NULL;
device_create_info.enabledExtensionCount = enabled_extensions.count;
device_create_info.ppEnabledExtensionNames = enabled_extensions.values;
device_create_info.queueCreateInfoCount = queue_create_info.size();
device_create_info.pQueueCreateInfos = queue_create_info.data();
device_create_info.pEnabledFeatures = NULL;
VkPhysicalDeviceFeatures2 enabled_features2;
memset(&enabled_features2, 0, sizeof(enabled_features2));
enabled_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
device_create_info.pNext = &enabled_features2;
if (available_features->shaderInt64) {
enabled_features2.features.shaderInt64 = VK_TRUE;
}
iree_hal_vulkan_features_t enabled_features = 0;
IREE_TRACE({
if (iree_all_bits_set(requested_features,
IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING)) {
enabled_features |= IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING;
}
});
if (iree_all_bits_set(requested_features,
IREE_HAL_VULKAN_FEATURE_ENABLE_SPARSE_BINDING) &&
available_features->sparseBinding) {
enabled_features2.features.sparseBinding = VK_TRUE;
enabled_features |= IREE_HAL_VULKAN_FEATURE_ENABLE_SPARSE_BINDING;
}
if (iree_all_bits_set(
requested_features,
IREE_HAL_VULKAN_FEATURE_ENABLE_SPARSE_RESIDENCY_ALIASED) &&
available_features->sparseResidencyBuffer &&
available_features->sparseResidencyAliased) {
enabled_features2.features.sparseResidencyBuffer = VK_TRUE;
enabled_features2.features.sparseResidencyAliased = VK_TRUE;
enabled_features |= IREE_HAL_VULKAN_FEATURE_ENABLE_SPARSE_RESIDENCY_ALIASED;
}
if (iree_all_bits_set(requested_features,
IREE_HAL_VULKAN_FEATURE_ENABLE_ROBUST_BUFFER_ACCESS)) {
if (available_features->robustBufferAccess != VK_TRUE) {
return iree_make_status(
IREE_STATUS_UNAVAILABLE,
"robust buffer access not supported by physical device");
}
enabled_features2.features.robustBufferAccess = VK_TRUE;
enabled_features |= IREE_HAL_VULKAN_FEATURE_ENABLE_ROBUST_BUFFER_ACCESS;
}
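  // Optional feature structs below are chained onto enabled_features2.pNext;
  // each struct is pushed onto the head of the existing chain so earlier
  // entries remain reachable.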
VkPhysicalDeviceBufferDeviceAddressFeatures buffer_device_address_features;
if (iree_all_bits_set(
requested_features,
IREE_HAL_VULKAN_FEATURE_ENABLE_BUFFER_DEVICE_ADDRESSES) &&
available_buffer_device_address_features.bufferDeviceAddress) {
memset(&buffer_device_address_features, 0,
sizeof(buffer_device_address_features));
buffer_device_address_features.sType =
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES;
buffer_device_address_features.pNext = enabled_features2.pNext;
enabled_features2.pNext = &buffer_device_address_features;
buffer_device_address_features.bufferDeviceAddress = true;
enabled_features |= IREE_HAL_VULKAN_FEATURE_ENABLE_BUFFER_DEVICE_ADDRESSES;
}
VkPhysicalDeviceTimelineSemaphoreFeatures semaphore_features;
memset(&semaphore_features, 0, sizeof(semaphore_features));
semaphore_features.sType =
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES;
semaphore_features.pNext = enabled_features2.pNext;
enabled_features2.pNext = &semaphore_features;
semaphore_features.timelineSemaphore = VK_TRUE;
VkPhysicalDeviceHostQueryResetFeaturesEXT host_query_reset_features;
if (enabled_device_extensions.host_query_reset) {
memset(&host_query_reset_features, 0, sizeof(host_query_reset_features));
host_query_reset_features.sType =
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT;
host_query_reset_features.pNext = enabled_features2.pNext;
enabled_features2.pNext = &host_query_reset_features;
host_query_reset_features.hostQueryReset = VK_TRUE;
}
VkPhysicalDeviceSubgroupSizeControlFeatures subgroup_control_features;
if (enabled_device_extensions.subgroup_size_control) {
memset(&subgroup_control_features, 0, sizeof(subgroup_control_features));
subgroup_control_features.sType =
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES;
subgroup_control_features.pNext = enabled_features2.pNext;
enabled_features2.pNext = &subgroup_control_features;
subgroup_control_features.subgroupSizeControl = VK_TRUE;
}
auto logical_device = new VkDeviceHandle(
instance_syms, physical_device, enabled_features,
enabled_device_extensions,
/*owns_device=*/true, host_allocator, /*allocator=*/NULL);
iree_status_t status = VK_RESULT_TO_STATUS(
instance_syms->vkCreateDevice(physical_device, &device_create_info,
logical_device->allocator(),
logical_device->mutable_value()),
"vkCreateDevice");
if (iree_status_is_ok(status)) {
status = logical_device->syms()->LoadFromDevice(instance,
logical_device->value());
}
// Select queue indices and create command queues with them.
iree_hal_vulkan_queue_set_t compute_queue_set;
iree_hal_vulkan_queue_set_t transfer_queue_set;
if (iree_status_is_ok(status)) {
status = iree_hal_vulkan_build_queue_sets(
options, physical_device, logical_device->syms().get(),
&compute_queue_set, &transfer_queue_set);
}
// Allocate and initialize the device.
if (iree_status_is_ok(status)) {
status = iree_hal_vulkan_device_create_internal(
driver, identifier, enabled_features, options, instance,
physical_device, logical_device, &enabled_device_extensions,
&compute_queue_set, &transfer_queue_set, host_allocator, out_device);
}
logical_device->ReleaseReference();
return status;
}
IREE_API_EXPORT iree_status_t iree_hal_vulkan_wrap_device(
iree_string_view_t identifier,
const iree_hal_vulkan_device_options_t* options,
const iree_hal_vulkan_syms_t* instance_syms, VkInstance instance,
VkPhysicalDevice physical_device, VkDevice logical_device,
const iree_hal_vulkan_queue_set_t* compute_queue_set,
const iree_hal_vulkan_queue_set_t* transfer_queue_set,
iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
IREE_ASSERT_ARGUMENT(instance_syms);
IREE_ASSERT_ARGUMENT(instance);
IREE_ASSERT_ARGUMENT(physical_device);
IREE_ASSERT_ARGUMENT(logical_device);
IREE_ASSERT_ARGUMENT(out_device);
if (iree_math_count_ones_u64(compute_queue_set->queue_indices) == 0) {
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"at least one compute queue is required");
}
// Grab symbols from the device.
auto device_syms = iree::make_ref<DynamicSymbols>();
device_syms->vkGetInstanceProcAddr =
((const DynamicSymbols*)instance_syms)->vkGetInstanceProcAddr;
IREE_RETURN_IF_ERROR(device_syms->LoadFromDevice(instance, logical_device));
// Since the device is already created, we can't actually enable any
// extensions or query if they are really enabled - we just have to trust
// that the caller already enabled them for us or we may fail later. For the
// optional extensions we check for the symbols but this is not always
// guaranteed to work.
iree_hal_vulkan_device_extensions_t enabled_device_extensions =
iree_hal_vulkan_infer_enabled_device_extensions(device_syms.get());
iree_hal_vulkan_features_t enabled_features = 0;
#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
enabled_features |= IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING;
#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
// Wrap the provided VkDevice with a VkDeviceHandle for use within the HAL.
auto logical_device_handle = new VkDeviceHandle(
device_syms.get(), physical_device, enabled_features,
enabled_device_extensions,
/*owns_device=*/false, host_allocator, /*allocator=*/NULL);
*logical_device_handle->mutable_value() = logical_device;
// Allocate and initialize the device.
iree_status_t status = iree_hal_vulkan_device_create_internal(
/*driver=*/NULL, identifier, enabled_features, options, instance,
physical_device, logical_device_handle, &enabled_device_extensions,
compute_queue_set, transfer_queue_set, host_allocator, out_device);
logical_device_handle->ReleaseReference();
return status;
}
static iree_string_view_t iree_hal_vulkan_device_id(
iree_hal_device_t* base_device) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
return device->identifier;
}
static iree_allocator_t iree_hal_vulkan_device_host_allocator(
iree_hal_device_t* base_device) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
return device->host_allocator;
}
static iree_hal_allocator_t* iree_hal_vulkan_device_allocator(
iree_hal_device_t* base_device) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
return device->device_allocator;
}
static void iree_hal_vulkan_replace_device_allocator(
iree_hal_device_t* base_device, iree_hal_allocator_t* new_allocator) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
iree_hal_allocator_retain(new_allocator);
iree_hal_allocator_release(device->device_allocator);
device->device_allocator = new_allocator;
}
static void iree_hal_vulkan_replace_channel_provider(
iree_hal_device_t* base_device, iree_hal_channel_provider_t* new_provider) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
iree_hal_channel_provider_retain(new_provider);
iree_hal_channel_provider_release(device->channel_provider);
device->channel_provider = new_provider;
}
static iree_status_t iree_hal_vulkan_device_trim(
iree_hal_device_t* base_device) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
iree_arena_block_pool_trim(&device->block_pool);
return iree_hal_allocator_trim(device->device_allocator);
}
static iree_status_t iree_hal_vulkan_device_query_i64(
iree_hal_device_t* base_device, iree_string_view_t category,
iree_string_view_t key, int64_t* out_value) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
*out_value = 0;
if (iree_string_view_equal(category,
iree_make_cstring_view("hal.executable.format"))) {
if (iree_string_view_equal(key,
iree_make_cstring_view("vulkan-spirv-fb"))) {
// Base SPIR-V always supported.
*out_value = 1;
} else if (iree_string_view_equal(
key, iree_make_cstring_view("vulkan-spirv-fb-ptr"))) {
// SPIR-V with device addresses is optionally supported based on whether
// we have device feature support.
*out_value = iree_all_bits_set(
device->logical_device->enabled_features(),
IREE_HAL_VULKAN_FEATURE_ENABLE_BUFFER_DEVICE_ADDRESSES)
? 1
: 0;
}
return iree_ok_status();
}
return iree_make_status(
IREE_STATUS_NOT_FOUND,
"unknown device configuration key value '%.*s :: %.*s'",
(int)category.size, category.data, (int)key.size, key.data);
}
// Returns the queue to submit work to based on the |queue_affinity|.
static CommandQueue* iree_hal_vulkan_device_select_queue(
iree_hal_vulkan_device_t* device,
iree_hal_command_category_t command_categories,
iree_hal_queue_affinity_t queue_affinity) {
// TODO(scotttodd): revisit queue selection logic and remove this
// * the unaligned buffer fill polyfill and tracing timestamp queries may
// both insert dispatches into command buffers that at compile time are
// expected to only contain transfer commands
// * we could set a bit at recording time if emulation or tracing is used
// and submit to the right queue based on that
command_categories |= IREE_HAL_COMMAND_CATEGORY_DISPATCH;
// TODO(benvanik): meaningful heuristics for affinity. We don't generate
// anything from the compiler that uses multiple queues and until we do it's
// best not to do anything too clever here.
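  // Round-robin across the queues of the selected category using the affinity
  // value as a rotation index.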
if (command_categories == IREE_HAL_COMMAND_CATEGORY_TRANSFER) {
return device
->transfer_queues[queue_affinity % device->transfer_queue_count];
}
return device->dispatch_queues[queue_affinity % device->dispatch_queue_count];
}
static iree_status_t iree_hal_vulkan_device_create_channel(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
iree_hal_channel_params_t params, iree_hal_channel_t** out_channel) {
return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
"collectives not implemented");
}
static iree_status_t iree_hal_vulkan_device_create_command_buffer(
iree_hal_device_t* base_device, iree_hal_command_buffer_mode_t mode,
iree_hal_command_category_t command_categories,
iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
iree_hal_command_buffer_t** out_command_buffer) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
// TODO(scotttodd): revisit queue selection logic and remove this
// * the unaligned buffer fill polyfill and tracing timestamp queries may
// both insert dispatches into command buffers that at compile time are
// expected to only contain transfer commands
// * we could set a bit at recording time if emulation or tracing is used
// and submit to the right queue based on that
command_categories |= IREE_HAL_COMMAND_CATEGORY_DISPATCH;
// Select the command pool to use based on the types of commands being recorded.
// Note that we may not have a dedicated transfer command pool if there are
// no dedicated transfer queues.
VkCommandPoolHandle* command_pool = NULL;
if (device->transfer_command_pool &&
!iree_all_bits_set(command_categories,
IREE_HAL_COMMAND_CATEGORY_DISPATCH)) {
command_pool = device->transfer_command_pool;
} else {
command_pool = device->dispatch_command_pool;
}
// The tracing context is tied to a particular queue so we must select here
// even though ideally we'd do it during submission. This is informational
// only and if the user does provide a different queue affinity during
// submission it just means the commands will be attributed to the wrong
// queue.
CommandQueue* queue = iree_hal_vulkan_device_select_queue(
device, command_categories, queue_affinity);
return iree_hal_vulkan_direct_command_buffer_allocate(
base_device, device->logical_device, command_pool, mode,
command_categories, queue_affinity, binding_capacity,
queue->tracing_context(), device->descriptor_pool_cache,
device->builtin_executables, &device->block_pool, out_command_buffer);
}
static iree_status_t iree_hal_vulkan_device_create_descriptor_set_layout(
iree_hal_device_t* base_device,
iree_hal_descriptor_set_layout_flags_t flags,
iree_host_size_t binding_count,
const iree_hal_descriptor_set_layout_binding_t* bindings,
iree_hal_descriptor_set_layout_t** out_descriptor_set_layout) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
return iree_hal_vulkan_native_descriptor_set_layout_create(
device->logical_device, flags, binding_count, bindings,
out_descriptor_set_layout);
}
static iree_status_t iree_hal_vulkan_device_create_event(
iree_hal_device_t* base_device, iree_hal_event_t** out_event) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
return iree_hal_vulkan_native_event_create(device->logical_device, out_event);
}
static iree_status_t iree_hal_vulkan_device_create_executable_cache(
iree_hal_device_t* base_device, iree_string_view_t identifier,
iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
return iree_hal_vulkan_nop_executable_cache_create(
device->logical_device, identifier, out_executable_cache);
}
static iree_status_t iree_hal_vulkan_device_import_file(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
iree_hal_memory_access_t access,
iree_hal_external_file_t* IREE_RESTRICT external_file,
iree_hal_file_release_callback_t release_callback,
iree_hal_file_t** out_file) {
if (external_file->type != IREE_HAL_EXTERNAL_FILE_TYPE_HOST_ALLOCATION) {
return iree_make_status(
IREE_STATUS_UNAVAILABLE,
"implementation does not support the external file type");
}
return iree_hal_memory_file_wrap(
queue_affinity, access, external_file->handle.host_allocation,
release_callback, iree_hal_device_allocator(base_device),
iree_hal_device_host_allocator(base_device), out_file);
}
static iree_status_t iree_hal_vulkan_device_create_pipeline_layout(
iree_hal_device_t* base_device, iree_host_size_t push_constants,
iree_host_size_t set_layout_count,
iree_hal_descriptor_set_layout_t* const* set_layouts,
iree_hal_pipeline_layout_t** out_pipeline_layout) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
return iree_hal_vulkan_native_pipeline_layout_create(
device->logical_device, push_constants, set_layout_count, set_layouts,
out_pipeline_layout);
}
static iree_status_t iree_hal_vulkan_device_create_semaphore(
iree_hal_device_t* base_device, uint64_t initial_value,
iree_hal_semaphore_t** out_semaphore) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
return iree_hal_vulkan_native_semaphore_create(device->logical_device,
initial_value, out_semaphore);
}
static iree_hal_semaphore_compatibility_t
iree_hal_vulkan_device_query_semaphore_compatibility(
iree_hal_device_t* base_device, iree_hal_semaphore_t* semaphore) {
if (iree_hal_vulkan_native_semaphore_isa(semaphore)) {
// Fast-path for semaphores related to this device.
// TODO(benvanik): ensure the creating devices are compatible in cases where
// multiple devices are used.
return IREE_HAL_SEMAPHORE_COMPATIBILITY_ALL;
}
// TODO(benvanik): semaphore APIs for querying allowed export formats. We
// can check device caps to see what external semaphore types are supported.
return IREE_HAL_SEMAPHORE_COMPATIBILITY_HOST_ONLY;
}
static iree_status_t iree_hal_vulkan_device_queue_alloca(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params,
iree_device_size_t allocation_size,
iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
// TODO(benvanik): queue-ordered allocations.
IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list,
iree_infinite_timeout()));
IREE_RETURN_IF_ERROR(
iree_hal_allocator_allocate_buffer(iree_hal_device_allocator(base_device),
params, allocation_size, out_buffer));
IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_signal(signal_semaphore_list));
return iree_ok_status();
}
static iree_status_t iree_hal_vulkan_device_queue_dealloca(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_buffer_t* buffer) {
// TODO(benvanik): queue-ordered allocations.
IREE_RETURN_IF_ERROR(iree_hal_device_queue_barrier(
base_device, queue_affinity, wait_semaphore_list, signal_semaphore_list));
return iree_ok_status();
}
static iree_status_t iree_hal_vulkan_device_queue_read(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_file_t* source_file, uint64_t source_offset,
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
iree_device_size_t length, uint32_t flags) {
// TODO: expose streaming chunk count/size options.
iree_status_t loop_status = iree_ok_status();
iree_hal_file_transfer_options_t options = {
/*.loop=*/iree_loop_inline(&loop_status),
/*.chunk_count=*/IREE_HAL_FILE_TRANSFER_CHUNK_COUNT_DEFAULT,
/*.chunk_size=*/IREE_HAL_FILE_TRANSFER_CHUNK_SIZE_DEFAULT,
};
IREE_RETURN_IF_ERROR(iree_hal_device_queue_read_streaming(
base_device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
source_file, source_offset, target_buffer, target_offset, length, flags,
options));
return loop_status;
}
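// Writes a range of a device buffer to a file using the same streaming
// transfer utility and inline-loop error handling as queue_read above.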
static iree_status_t iree_hal_vulkan_device_queue_write(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
iree_hal_file_t* target_file, uint64_t target_offset,
iree_device_size_t length, uint32_t flags) {
// TODO: expose streaming chunk count/size options.
iree_status_t loop_status = iree_ok_status();
iree_hal_file_transfer_options_t options = {
/*.loop=*/iree_loop_inline(&loop_status),
/*.chunk_count=*/IREE_HAL_FILE_TRANSFER_CHUNK_COUNT_DEFAULT,
/*.chunk_size=*/IREE_HAL_FILE_TRANSFER_CHUNK_SIZE_DEFAULT,
};
IREE_RETURN_IF_ERROR(iree_hal_device_queue_write_streaming(
base_device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
source_buffer, source_offset, target_file, target_offset, length, flags,
options));
return loop_status;
}
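// Submits command buffers to a queue selected from the device by affinity and
// then blocks until the signal semaphores are reached (see the HACK note
// below regarding resource lifetime tracking).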
static iree_status_t iree_hal_vulkan_device_queue_execute(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_host_size_t command_buffer_count,
iree_hal_command_buffer_t* const* command_buffers) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
// NOTE: today we are not discriminating queues based on command type.
CommandQueue* queue = iree_hal_vulkan_device_select_queue(
device, IREE_HAL_COMMAND_CATEGORY_DISPATCH, queue_affinity);
iree_hal_submission_batch_t batch = {
/*.wait_semaphores=*/wait_semaphore_list,
/*.command_buffer_count=*/command_buffer_count,
/*.command_buffers=*/command_buffers,
/*.signal_semaphores=*/signal_semaphore_list,
};
IREE_RETURN_IF_ERROR(queue->Submit(1, &batch));
  // HACK: we don't track async resource lifetimes, so we have to block.
return iree_hal_semaphore_list_wait(signal_semaphore_list,
iree_infinite_timeout());
}
static iree_status_t iree_hal_vulkan_device_queue_flush(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity) {
// Currently unused; we flush as submissions are made.
return iree_ok_status();
}
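// Waits on all (or, for IREE_HAL_WAIT_MODE_ANY, any) of the semaphores in the
// list by mapping the wait mode onto VkSemaphoreWaitFlags.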
static iree_status_t iree_hal_vulkan_device_wait_semaphores(
iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
VkSemaphoreWaitFlags wait_flags = 0;
if (wait_mode == IREE_HAL_WAIT_MODE_ANY) {
wait_flags |= VK_SEMAPHORE_WAIT_ANY_BIT;
}
return iree_hal_vulkan_native_semaphore_multi_wait(
device->logical_device, &semaphore_list, timeout, wait_flags);
}
static iree_status_t iree_hal_vulkan_device_profiling_begin(
iree_hal_device_t* base_device,
const iree_hal_device_profiling_options_t* options) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
(void)device;
if (iree_all_bits_set(options->mode,
IREE_HAL_DEVICE_PROFILING_MODE_QUEUE_OPERATIONS)) {
    // AMD-specific: we could query the device vendor and only do this on AMD
    // GPUs, but it is relatively cheap and may be useful to other tools.
    // Ideally there would be a Khronos-standard way to do this.
// TODO(benvanik): figure out if we need to do this for all queues.
auto& syms = device->logical_device->syms();
if (syms->vkQueueInsertDebugUtilsLabelEXT) {
VkDebugUtilsLabelEXT begin_label = {};
begin_label.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT;
begin_label.pNext = NULL;
begin_label.pLabelName = "AmdFrameBegin";
      syms->vkQueueInsertDebugUtilsLabelEXT(
          device->dispatch_queues[0]->handle(), &begin_label);
}
    // For now we only support RenderDoc. As much as possible we should try to
    // use standardized Vulkan extensions for profiling configuration/control,
    // such as
    // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VK_KHR_performance_query.html,
    // to avoid a combinatorial explosion of vendor-specific tooling hooks.
    // Since RenderDoc is fairly simple, cross-platform, and cross-vendor, we
    // support it here. If this grows beyond a few lines of code we should
    // shuffle it off to another file.
#if defined(IREE_HAL_VULKAN_HAVE_RENDERDOC)
iree_hal_vulkan_begin_renderdoc_capture(device->renderdoc_api,
device->instance, options);
#endif // IREE_HAL_VULKAN_HAVE_RENDERDOC
}
return iree_ok_status();
}
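// Flushes any pending profiling data; when compiled with device
// instrumentation tracing and the device has tracing enabled this collects
// outstanding GPU timestamps from each queue's tracing context, otherwise it
// is a no-op.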
static iree_status_t iree_hal_vulkan_device_profiling_flush(
iree_hal_device_t* base_device) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
(void)device;
#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
if (iree_all_bits_set(device->logical_device->enabled_features(),
IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING)) {
for (iree_host_size_t i = 0; i < device->queue_count; ++i) {
iree_hal_vulkan_tracing_context_t* tracing_context =
device->queues[i]->tracing_context();
if (tracing_context) {
iree_hal_vulkan_tracing_context_collect(tracing_context,
VK_NULL_HANDLE);
}
}
}
#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
return iree_ok_status();
}
static iree_status_t iree_hal_vulkan_device_profiling_end(
iree_hal_device_t* base_device) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
(void)device;
#if defined(IREE_HAL_VULKAN_HAVE_RENDERDOC)
iree_hal_vulkan_end_renderdoc_capture(device->renderdoc_api,
device->instance);
#endif // IREE_HAL_VULKAN_HAVE_RENDERDOC
  // AMD-specific: mirrors the AmdFrameBegin label inserted in
  // profiling_begin above.
auto& syms = device->logical_device->syms();
if (syms->vkQueueInsertDebugUtilsLabelEXT) {
VkDebugUtilsLabelEXT end_label = {};
end_label.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT;
end_label.pNext = NULL;
end_label.pLabelName = "AmdFrameEnd";
    syms->vkQueueInsertDebugUtilsLabelEXT(
        device->dispatch_queues[0]->handle(), &end_label);
}
return iree_ok_status();
}
namespace {
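// Dispatch table wiring the HAL device interface to the Vulkan implementation
// above; the entry order must match the iree_hal_device_vtable_t declaration.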
const iree_hal_device_vtable_t iree_hal_vulkan_device_vtable = {
/*.destroy=*/iree_hal_vulkan_device_destroy,
/*.id=*/iree_hal_vulkan_device_id,
/*.host_allocator=*/iree_hal_vulkan_device_host_allocator,
/*.device_allocator=*/iree_hal_vulkan_device_allocator,
/*.replace_device_allocator=*/iree_hal_vulkan_replace_device_allocator,
/*.replace_channel_provider=*/iree_hal_vulkan_replace_channel_provider,
/*.trim=*/iree_hal_vulkan_device_trim,
/*.query_i64=*/iree_hal_vulkan_device_query_i64,
/*.create_channel=*/iree_hal_vulkan_device_create_channel,
/*.create_command_buffer=*/iree_hal_vulkan_device_create_command_buffer,
/*.create_descriptor_set_layout=*/
iree_hal_vulkan_device_create_descriptor_set_layout,
/*.create_event=*/iree_hal_vulkan_device_create_event,
/*.create_executable_cache=*/
iree_hal_vulkan_device_create_executable_cache,
/*.import_file=*/iree_hal_vulkan_device_import_file,
/*.create_pipeline_layout=*/
iree_hal_vulkan_device_create_pipeline_layout,
/*.create_semaphore=*/iree_hal_vulkan_device_create_semaphore,
/*.query_semaphore_compatibility=*/
iree_hal_vulkan_device_query_semaphore_compatibility,
/*.transfer_range=*/iree_hal_device_submit_transfer_range_and_wait,
/*.queue_alloca=*/iree_hal_vulkan_device_queue_alloca,
/*.queue_dealloca=*/iree_hal_vulkan_device_queue_dealloca,
/*.queue_read=*/iree_hal_vulkan_device_queue_read,
/*.queue_write=*/iree_hal_vulkan_device_queue_write,
/*.queue_execute=*/iree_hal_vulkan_device_queue_execute,
/*.queue_flush=*/iree_hal_vulkan_device_queue_flush,
/*.wait_semaphores=*/iree_hal_vulkan_device_wait_semaphores,
/*.profiling_begin=*/iree_hal_vulkan_device_profiling_begin,
/*.profiling_flush=*/iree_hal_vulkan_device_profiling_flush,
/*.profiling_end=*/iree_hal_vulkan_device_profiling_end,
};
} // namespace