| // Copyright 2021 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "iree/hal/drivers/vulkan/tracing.h" |
| |
| #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION |
| |
| #include "iree/base/api.h" |
| #include "iree/base/target_platform.h" |
| #include "third_party/tracy/Tracy.hpp" |
| #include "third_party/tracy/client/TracyProfiler.hpp" |
| #include "third_party/tracy/common/TracyAlloc.hpp" |
| |
// Number of timestamp queries requested for the per-queue query pool. This
// bounds how many queries may be outstanding before a collection is required.
#define IREE_HAL_VULKAN_TRACING_DEFAULT_QUERY_CAPACITY (32 * 1024)

// Upper bound on the number of queries read back from the API by a single
// readback call during collection.
#define IREE_HAL_VULKAN_TRACING_READBACK_QUERY_CAPACITY (8 * 1024)

// Number of calibrated timestamp probes issued when estimating max_deviation.
// More probes give better confidence in the lower bound that is derived.
#define IREE_HAL_VULKAN_TRACING_MAX_DEVIATION_PROBE_COUNT 32
| |
// A single readback entry as produced by vkGetQueryPoolResults when 64-bit
// results with availability are requested: [timestamp, availability].
typedef struct iree_hal_vulkan_timestamp_query_t {
  // Timestamp value reported for the query.
  uint64_t timestamp;
  // Availability status; non-zero if |timestamp| is available.
  uint64_t availability;
} iree_hal_vulkan_timestamp_query_t;
| |
| struct iree_hal_vulkan_tracing_context_t { |
| // Device and queue the context represents. |
| iree::hal::vulkan::VkDeviceHandle* logical_device; |
| VkQueue queue; |
| iree_allocator_t host_allocator; |
| |
| // Maintenance queue that supports dispatch commands and can be used to reset |
| // queries. |
| VkQueue maintenance_dispatch_queue; |
| // Command pool that serves command buffers compatible with the |
| // |maintenance_dispatch_queue|. |
| iree::hal::vulkan::VkCommandPoolHandle* maintenance_command_pool; |
| |
| // A unique GPU zone ID allocated from Tracy. |
| // There is a global limit of 255 GPU zones (ID 255 is special). |
| uint8_t id; |
| |
| // Defines how the timestamps are interpreted (device-specific, posix, QPC). |
| // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkTimeDomainEXT.html |
| VkTimeDomainEXT time_domain; |
| |
| // Maximum expected deviation between CPU and GPU timestamps based on an |
| // average computed at startup. Calibration events that exceed this value are |
| // discarded. |
| uint64_t max_expected_deviation; |
| |
| // Vulkan-reported CPU timestamp of the last calibration. |
| // Used to detect when drift occurs and we need to notify tracy. |
| uint64_t previous_cpu_time; |
| |
| // Pool of query instances that we treat as a backing store for a ringbuffer. |
| VkQueryPool query_pool; |
| |
| // Indices into |query_pool| defining a ringbuffer. |
| uint32_t query_head; |
| uint32_t query_tail; |
| uint32_t query_capacity; |
| |
| // Readback storage; large enough to get a decent chunk of queries back from |
| // the API in one shot. |
| // |
| // Data is stored as [[timestamp, availability], ...]. |
| // Availability will be non-zero if the timestamp is valid. Since we put all |
| // timestamps in order once we reach an unavailable timestamp we can bail |
| // and leave that for future collections. |
| iree_hal_vulkan_timestamp_query_t |
| readback_buffer[IREE_HAL_VULKAN_TRACING_READBACK_QUERY_CAPACITY]; |
| }; |
| |
| // Allocates and begins a command buffer and returns its handle. |
| // Returns VK_NULL_HANDLE if allocation fails. |
| static VkCommandBuffer iree_hal_vulkan_tracing_begin_command_buffer( |
| iree_hal_vulkan_tracing_context_t* context) { |
| const auto& syms = context->logical_device->syms(); |
| |
| VkCommandBufferAllocateInfo command_buffer_info; |
| memset(&command_buffer_info, 0, sizeof(command_buffer_info)); |
| command_buffer_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; |
| command_buffer_info.commandPool = *context->maintenance_command_pool; |
| command_buffer_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; |
| command_buffer_info.commandBufferCount = 1; |
| VkCommandBuffer command_buffer = VK_NULL_HANDLE; |
| IREE_IGNORE_ERROR(context->maintenance_command_pool->Allocate( |
| &command_buffer_info, &command_buffer)); |
| if (!command_buffer) return VK_NULL_HANDLE; |
| |
| VkCommandBufferBeginInfo begin_info; |
| memset(&begin_info, 0, sizeof(begin_info)); |
| begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; |
| begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; |
| syms->vkBeginCommandBuffer(command_buffer, &begin_info); |
| |
| return command_buffer; |
| } |
| |
| // Ends and submits |command_buffer| and waits for it to complete. |
| static void iree_hal_vulkan_tracing_submit_command_buffer( |
| iree_hal_vulkan_tracing_context_t* context, |
| VkCommandBuffer command_buffer) { |
| const auto& syms = context->logical_device->syms(); |
| |
| syms->vkEndCommandBuffer(command_buffer); |
| |
| VkSubmitInfo submit_info; |
| memset(&submit_info, 0, sizeof(submit_info)); |
| submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; |
| submit_info.commandBufferCount = 1; |
| submit_info.pCommandBuffers = &command_buffer; |
| syms->vkQueueSubmit(context->maintenance_dispatch_queue, 1, &submit_info, |
| VK_NULL_HANDLE); |
| syms->vkQueueWaitIdle(context->maintenance_dispatch_queue); |
| |
| context->maintenance_command_pool->Free(command_buffer); |
| } |
| |
| // Synchronously resets a range of querys in a query pool. |
| // This may submit commands to the queue. |
| static void iree_hal_vulkan_tracing_reset_query_pool( |
| iree_hal_vulkan_tracing_context_t* context, uint32_t query_index, |
| uint32_t query_count) { |
| const auto& syms = context->logical_device->syms(); |
| |
| // Fast-path for when host-side vkResetQueryPool is available. |
| // This is core in Vulkan 1.2. |
| if (context->logical_device->enabled_extensions().host_query_reset) { |
| PFN_vkResetQueryPool vkResetQueryPool_fn = syms->vkResetQueryPool |
| ? syms->vkResetQueryPool |
| : syms->vkResetQueryPoolEXT; |
| if (vkResetQueryPool_fn != NULL) { |
| vkResetQueryPool_fn(*context->logical_device, context->query_pool, |
| query_index, query_count); |
| return; |
| } |
| } |
| |
| // Slow-path submitting a command buffer to reset the query pool. It's obvious |
| // why vkResetQueryPool was added :) |
| VkCommandBuffer command_buffer = |
| iree_hal_vulkan_tracing_begin_command_buffer(context); |
| if (command_buffer != VK_NULL_HANDLE) { |
| syms->vkCmdResetQueryPool(command_buffer, context->query_pool, query_index, |
| query_count); |
| iree_hal_vulkan_tracing_submit_command_buffer(context, command_buffer); |
| } |
| } |
| |
| // Attempts to get a timestamp from both the CPU and GPU that are correlated |
| // with each other. Only valid when calibration is supported. |
| static void iree_hal_vulkan_tracing_query_calibration_timestamps( |
| iree_hal_vulkan_tracing_context_t* context, uint64_t* out_cpu_time, |
| uint64_t* out_gpu_time) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| *out_cpu_time = 0; |
| *out_gpu_time = 0; |
| |
| VkCalibratedTimestampInfoEXT timestamp_infos[2]; |
| timestamp_infos[0].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT; |
| timestamp_infos[0].pNext = NULL; |
| timestamp_infos[0].timeDomain = VK_TIME_DOMAIN_DEVICE_EXT; |
| timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT; |
| timestamp_infos[1].pNext = NULL; |
| timestamp_infos[1].timeDomain = context->time_domain; |
| uint64_t timestamps[2] = {0, 0}; |
| uint64_t max_deviation = 0; |
| do { |
| context->logical_device->syms()->vkGetCalibratedTimestampsEXT( |
| *context->logical_device, IREE_ARRAYSIZE(timestamps), timestamp_infos, |
| timestamps, &max_deviation); |
| } while (max_deviation > context->max_expected_deviation); |
| |
| *out_gpu_time = timestamps[0]; |
| *out_cpu_time = timestamps[1]; |
| switch (context->time_domain) { |
| #if defined(IREE_PLATFORM_WINDOWS) |
| case VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT: |
| *out_cpu_time *= (uint64_t)(1000000000.0 / tracy::GetFrequencyQpc()); |
| break; |
| #else |
| case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT: |
| case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT: |
| // TODO(benvanik): posix calibrated timestamps - ignored for now. |
| break; |
| #endif // IREE_PLATFORM_WINDOWS |
| default: |
| break; |
| } |
| |
| IREE_TRACE_ZONE_END(z0); |
| } |
| |
| // Populates |out_cpu_time| and |out_gpu_time| with calibrated timestamps. |
| // Depending on whether VK_EXT_calibrated_timestamps is available this may be |
| // a guess done by ourselves (with lots of slop) or done by the driver (with |
| // less slop). |
| static void iree_hal_vulkan_tracing_perform_initial_calibration( |
| iree_hal_vulkan_tracing_context_t* context, uint64_t* out_cpu_time, |
| uint64_t* out_gpu_time) { |
| const auto& syms = context->logical_device->syms(); |
| *out_cpu_time = 0; |
| *out_gpu_time = 0; |
| |
| IREE_TRACE_ZONE_BEGIN(z0); |
| IREE_TRACE_ZONE_APPEND_TEXT(z0, |
| context->time_domain == VK_TIME_DOMAIN_DEVICE_EXT |
| ? "VK_TIME_DOMAIN_DEVICE_EXT" |
| : "VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT"); |
| |
| // Attempt to get a timestamp from both the device and the host at roughly the |
| // same time. There's a gap between when we get control returned to use after |
| // submitting and waiting for idle and that will be the slop we have in the |
| // timings in the tracy UI. |
| if (context->time_domain == VK_TIME_DOMAIN_DEVICE_EXT) { |
| // Submit a device timestamp. |
| VkCommandBuffer command_buffer = |
| iree_hal_vulkan_tracing_begin_command_buffer(context); |
| if (command_buffer != VK_NULL_HANDLE) { |
| syms->vkCmdWriteTimestamp(command_buffer, |
| VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, |
| context->query_pool, 0); |
| iree_hal_vulkan_tracing_submit_command_buffer(context, command_buffer); |
| } |
| |
| // Query the timestamp from the host and the device. |
| *out_cpu_time = tracy::Profiler::GetTime(); |
| syms->vkGetQueryPoolResults( |
| *context->logical_device, context->query_pool, 0, 1, |
| sizeof(*out_gpu_time), out_gpu_time, sizeof(*out_gpu_time), |
| VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); |
| |
| // Reset the query used. |
| iree_hal_vulkan_tracing_reset_query_pool(context, 0, 1); |
| IREE_TRACE_ZONE_END(z0); |
| return; |
| } |
| |
| // From the spec: |
| // The maximum deviation may vary between calls to |
| // vkGetCalibratedTimestampsEXT even for the same set of time domains due to |
| // implementation and platform specific reasons. It is the application’s |
| // responsibility to assess whether the returned maximum deviation makes the |
| // timestamp values suitable for any particular purpose and can choose to |
| // re-issue the timestamp calibration call pursuing a lower devation value. |
| // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/vkGetCalibratedTimestampsEXT.html |
| // |
| // We perform a small number of queries here and find the minimum deviation |
| // across all of them to get an average lower bound on the maximum deviation |
| // from any particular query. We then use that as our baseline (plus some |
| // slop) to see if calibration events in the future are reasonable. |
| VkCalibratedTimestampInfoEXT timestamp_infos[2]; |
| timestamp_infos[0].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT; |
| timestamp_infos[0].pNext = NULL; |
| timestamp_infos[0].timeDomain = VK_TIME_DOMAIN_DEVICE_EXT; |
| timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT; |
| timestamp_infos[1].pNext = NULL; |
| timestamp_infos[1].timeDomain = context->time_domain; |
| uint64_t max_deviations[IREE_HAL_VULKAN_TRACING_MAX_DEVIATION_PROBE_COUNT]; |
| IREE_TRACE_ZONE_BEGIN_NAMED(z1, "vkGetCalibratedTimestampsEXT"); |
| for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(max_deviations); ++i) { |
| uint64_t timestamps[2] = {0, 0}; |
| syms->vkGetCalibratedTimestampsEXT( |
| *context->logical_device, IREE_ARRAYSIZE(timestamps), timestamp_infos, |
| timestamps, &max_deviations[i]); |
| } |
| IREE_TRACE_ZONE_END(z1); |
| uint64_t min_deviation = max_deviations[0]; |
| for (iree_host_size_t i = 1; i < IREE_ARRAYSIZE(max_deviations); ++i) { |
| min_deviation = iree_min(min_deviation, max_deviations[i]); |
| } |
| context->max_expected_deviation = min_deviation * 3 / 2; |
| |
| iree_hal_vulkan_tracing_query_calibration_timestamps( |
| context, &context->previous_cpu_time, out_gpu_time); |
| *out_cpu_time = tracy::Profiler::GetTime(); |
| |
| IREE_TRACE_ZONE_END(z0); |
| } |
| |
| // Performs a periodic calibration (if supported) and sends the data to tracy. |
| // Over time the host and device clocks may drift (especially with power events) |
| // and by frequently performing this we ensure that the samples we are sending |
| // to tracy are able to be correlated. |
| void iree_hal_vulkan_tracing_perform_calibration( |
| iree_hal_vulkan_tracing_context_t* context) { |
| if (context->time_domain == VK_TIME_DOMAIN_DEVICE_EXT) return; |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| uint64_t cpu_time = 0; |
| uint64_t gpu_time = 0; |
| iree_hal_vulkan_tracing_query_calibration_timestamps(context, &cpu_time, |
| &gpu_time); |
| |
| uint64_t tracy_time = tracy::Profiler::GetTime(); |
| if (cpu_time > context->previous_cpu_time) { |
| uint64_t cpu_delta = cpu_time - context->previous_cpu_time; |
| context->previous_cpu_time = cpu_time; |
| auto* item = tracy::Profiler::QueueSerial(); |
| tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuCalibration); |
| tracy::MemWrite(&item->gpuCalibration.gpuTime, gpu_time); |
| tracy::MemWrite(&item->gpuCalibration.cpuTime, tracy_time); |
| tracy::MemWrite(&item->gpuCalibration.cpuDelta, cpu_delta); |
| tracy::MemWrite(&item->gpuCalibration.context, context->id); |
| tracy::Profiler::QueueSerialFinish(); |
| } |
| |
| IREE_TRACE_ZONE_END(z0); |
| } |
| |
| // Prepares the VkQueryPool backing storage for our query ringbuffer. |
| static void iree_hal_vulkan_tracing_prepare_query_pool( |
| iree_hal_vulkan_tracing_context_t* context) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| // Create a query pool with the largest query capacity it can provide. |
| VkQueryPoolCreateInfo pool_info; |
| memset(&pool_info, 0, sizeof(pool_info)); |
| pool_info.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO; |
| pool_info.queryCount = IREE_HAL_VULKAN_TRACING_DEFAULT_QUERY_CAPACITY; |
| pool_info.queryType = VK_QUERY_TYPE_TIMESTAMP; |
| IREE_TRACE_ZONE_APPEND_VALUE(z0, pool_info.queryCount); |
| while (context->logical_device->syms()->vkCreateQueryPool( |
| *context->logical_device, &pool_info, |
| context->logical_device->allocator(), |
| &context->query_pool) != VK_SUCCESS) { |
| pool_info.queryCount /= 2; |
| IREE_TRACE_ZONE_APPEND_VALUE(z0, pool_info.queryCount); |
| } |
| context->query_capacity = pool_info.queryCount; |
| |
| // Perform initial reset of the query pool. All queries must be reset upon |
| // creation before first use. |
| iree_hal_vulkan_tracing_reset_query_pool(context, 0, context->query_capacity); |
| |
| IREE_TRACE_ZONE_END(z0); |
| } |
| |
| // Prepares the Tracy-related GPU context that events are fed into. Each context |
| // will appear as a unique plot in the tracy UI with the given |queue_name|. |
| static void iree_hal_vulkan_tracing_prepare_gpu_context( |
| iree_hal_vulkan_tracing_context_t* context, |
| VkPhysicalDevice physical_device, iree_string_view_t queue_name) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| // Allocate the process-unique GPU context ID. There's a max of 255 available; |
| // if we are recreating devices a lot we may exceed that. Don't do that, or |
| // wrap around and get weird (but probably still usable) numbers. |
| context->id = |
| tracy::GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed); |
| if (context->id >= 255) { |
| context->id %= 255; |
| } |
| |
| // The number of nanoseconds required for a timestamp query to be incremented |
| // by 1. |
| VkPhysicalDeviceProperties device_properties; |
| context->logical_device->syms()->vkGetPhysicalDeviceProperties( |
| physical_device, &device_properties); |
| float timestamp_period = device_properties.limits.timestampPeriod; |
| |
| // Perform initial calibration for tracy to be able to correlate timestamps |
| // between CPU and GPU. |
| uint64_t cpu_time = 0; |
| uint64_t gpu_time = 0; |
| iree_hal_vulkan_tracing_perform_initial_calibration(context, &cpu_time, |
| &gpu_time); |
| |
| uint8_t context_flags = 0; |
| if (context->time_domain != VK_TIME_DOMAIN_DEVICE_EXT) { |
| // Tell tracy we'll be passing calibrated timestamps and not to mess with |
| // the times. We'll periodically send GpuCalibration events in case the |
| // times drift. |
| context_flags |= tracy::GpuContextCalibration; |
| } |
| { |
| auto* item = tracy::Profiler::QueueSerial(); |
| tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuNewContext); |
| tracy::MemWrite(&item->gpuNewContext.cpuTime, cpu_time); |
| tracy::MemWrite(&item->gpuNewContext.gpuTime, gpu_time); |
| memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread)); |
| tracy::MemWrite(&item->gpuNewContext.period, timestamp_period); |
| tracy::MemWrite(&item->gpuNewContext.context, context->id); |
| tracy::MemWrite(&item->gpuNewContext.flags, context_flags); |
| tracy::MemWrite(&item->gpuNewContext.type, tracy::GpuContextType::Vulkan); |
| tracy::Profiler::QueueSerialFinish(); |
| } |
| |
| // Send the name of the context along. |
| // NOTE: Tracy will unconditionally free the name so we must clone it here. |
| // Since internally Tracy will use its own rpmalloc implementation we must |
| // make sure we allocate from the same source. |
| char* cloned_name = (char*)tracy::tracy_malloc(queue_name.size); |
| memcpy(cloned_name, queue_name.data, queue_name.size); |
| { |
| auto* item = tracy::Profiler::QueueSerial(); |
| tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuContextName); |
| tracy::MemWrite(&item->gpuContextNameFat.context, context->id); |
| tracy::MemWrite(&item->gpuContextNameFat.ptr, (uint64_t)cloned_name); |
| tracy::MemWrite(&item->gpuContextNameFat.size, queue_name.size); |
| tracy::Profiler::QueueSerialFinish(); |
| } |
| |
| IREE_TRACE_ZONE_END(z0); |
| } |
| |
| // Returns the best possible platform-supported time domain, falling back to |
| // VK_TIME_DOMAIN_DEVICE_EXT. By default it is one that is only usable for |
| // device-relative calculations and that we need to perform our own hacky |
| // calibration on. |
| static VkTimeDomainEXT iree_hal_vulkan_tracing_query_time_domain( |
| VkPhysicalDevice physical_device, |
| iree::hal::vulkan::VkDeviceHandle* logical_device) { |
| if (!logical_device->enabled_extensions().calibrated_timestamps) { |
| // Calibrated timestamps extension is not available; we'll only have the |
| // device domain. |
| return VK_TIME_DOMAIN_DEVICE_EXT; |
| } |
| |
| uint32_t time_domain_count = 0; |
| if (logical_device->syms()->vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( |
| physical_device, &time_domain_count, NULL) != VK_SUCCESS) { |
| return VK_TIME_DOMAIN_DEVICE_EXT; |
| } |
| VkTimeDomainEXT* time_domains = (VkTimeDomainEXT*)iree_alloca( |
| time_domain_count * sizeof(VkTimeDomainEXT)); |
| if (logical_device->syms()->vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( |
| physical_device, &time_domain_count, time_domains) != VK_SUCCESS) { |
| return VK_TIME_DOMAIN_DEVICE_EXT; |
| } |
| |
| for (uint32_t i = 0; i < time_domain_count; i++) { |
| switch (time_domains[i]) { |
| #if defined(IREE_PLATFORM_WINDOWS) |
| case VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT: |
| return time_domains[i]; |
| #else |
| case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT: |
| case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT: |
| // TODO(benvanik): support posix clock domains with some kind of math. |
| // return time_domains[i]; -- ignored |
| #endif // IREE_PLATFORM_WINDOWS |
| default: |
| continue; |
| } |
| } |
| return VK_TIME_DOMAIN_DEVICE_EXT; |
| } |
| |
| iree_status_t iree_hal_vulkan_tracing_context_allocate( |
| VkPhysicalDevice physical_device, |
| iree::hal::vulkan::VkDeviceHandle* logical_device, VkQueue queue, |
| iree_string_view_t queue_name, VkQueue maintenance_dispatch_queue, |
| iree::hal::vulkan::VkCommandPoolHandle* maintenance_command_pool, |
| iree_allocator_t host_allocator, |
| iree_hal_vulkan_tracing_context_t** out_context) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| IREE_ASSERT_ARGUMENT(logical_device); |
| IREE_ASSERT_ARGUMENT(out_context); |
| *out_context = NULL; |
| |
| iree_hal_vulkan_tracing_context_t* context = NULL; |
| iree_status_t status = |
| iree_allocator_malloc(host_allocator, sizeof(*context), (void**)&context); |
| if (iree_status_is_ok(status)) { |
| context->logical_device = logical_device; |
| context->queue = queue; |
| context->host_allocator = host_allocator; |
| context->time_domain = iree_hal_vulkan_tracing_query_time_domain( |
| physical_device, logical_device); |
| context->maintenance_dispatch_queue = maintenance_dispatch_queue; |
| context->maintenance_command_pool = maintenance_command_pool; |
| |
| // Prepare the query pool and perform the initial calibration. |
| iree_hal_vulkan_tracing_prepare_query_pool(context); |
| |
| // Prepare the Tracy GPU context. |
| iree_hal_vulkan_tracing_prepare_gpu_context(context, physical_device, |
| queue_name); |
| } |
| |
| if (iree_status_is_ok(status)) { |
| *out_context = context; |
| } else { |
| iree_hal_vulkan_tracing_context_free(context); |
| } |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| |
| void iree_hal_vulkan_tracing_context_free( |
| iree_hal_vulkan_tracing_context_t* context) { |
| if (!context) return; |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| if (context->query_pool != VK_NULL_HANDLE) { |
| // Always perform a collection on shutdown. |
| iree_hal_vulkan_tracing_context_collect(context, VK_NULL_HANDLE); |
| |
| auto* logical_device = context->logical_device; |
| logical_device->syms()->vkDestroyQueryPool( |
| *logical_device, context->query_pool, logical_device->allocator()); |
| } |
| |
| iree_allocator_t host_allocator = context->host_allocator; |
| iree_allocator_free(host_allocator, context); |
| |
| IREE_TRACE_ZONE_END(z0); |
| } |
| |
| uint32_t iree_hal_vulkan_tracing_context_acquire_query_id( |
| iree_hal_vulkan_tracing_context_t* context) { |
| uint32_t id = context->query_head; |
| context->query_head = (context->query_head + 1) % context->query_capacity; |
| assert(context->query_head != context->query_tail); |
| return id; |
| } |
| |
| void iree_hal_vulkan_tracing_context_collect( |
| iree_hal_vulkan_tracing_context_t* context, |
| VkCommandBuffer command_buffer) { |
| if (!context) return; |
| if (context->query_tail == context->query_head) { |
| // No outstanding queries. |
| return; |
| } |
| IREE_TRACE_ZONE_BEGIN(z0); |
| const auto& syms = context->logical_device->syms(); |
| |
| while (context->query_tail != context->query_head) { |
| // Compute the contiguous range of queries ready to be read. |
| // If the ringbuffer wraps around we'll handle that in the next loop. |
| uint32_t try_query_count = |
| context->query_head < context->query_tail |
| ? context->query_capacity - context->query_tail |
| : context->query_head - context->query_tail; |
| try_query_count = iree_min(try_query_count, |
| IREE_HAL_VULKAN_TRACING_READBACK_QUERY_CAPACITY); |
| |
| // Read back all of the queries. Note that we also are reading back the |
| // availability such that we can handle partial readiness of the outstanding |
| // range of queries. |
| uint32_t query_base = context->query_tail; |
| if (syms->vkGetQueryPoolResults( |
| *context->logical_device, context->query_pool, query_base, |
| try_query_count, sizeof(context->readback_buffer), |
| context->readback_buffer, sizeof(iree_hal_vulkan_timestamp_query_t), |
| VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) != |
| VK_SUCCESS) { |
| break; |
| } |
| |
| // Scan and feed the times to tracy, stopping when we hit the first |
| // unavailable query. |
| uint32_t read_query_count = 0; |
| for (uint32_t i = 0; i < try_query_count; ++i) { |
| if (context->readback_buffer[i].availability == 0) break; |
| read_query_count = i + 1; |
| auto* item = tracy::Profiler::QueueSerial(); |
| tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuTime); |
| tracy::MemWrite(&item->gpuTime.gpuTime, |
| context->readback_buffer[i].timestamp); |
| tracy::MemWrite(&item->gpuTime.queryId, (uint16_t)(query_base + i)); |
| tracy::MemWrite(&item->gpuTime.context, context->id); |
| tracy::Profiler::QueueSerialFinish(); |
| } |
| |
| // Reset the range of queries read back. |
| if (command_buffer != VK_NULL_HANDLE) { |
| syms->vkCmdResetQueryPool(command_buffer, context->query_pool, query_base, |
| read_query_count); |
| } else { |
| iree_hal_vulkan_tracing_reset_query_pool(context, query_base, |
| read_query_count); |
| } |
| |
| context->query_tail += read_query_count; |
| if (context->query_tail >= context->query_capacity) { |
| context->query_tail = 0; |
| } |
| } |
| |
| // Run calibration - we could do this less frequently in cases where collect |
| // is called every submission, however it's relatively cheap compared to all |
| // this other tracing overhead. |
| iree_hal_vulkan_tracing_perform_calibration(context); |
| |
| IREE_TRACE_ZONE_END(z0); |
| } |
| |
| void iree_hal_vulkan_tracing_zone_begin_impl( |
| iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer, |
| const iree_tracing_location_t* src_loc) { |
| if (!context) return; |
| |
| uint32_t query_id = iree_hal_vulkan_tracing_context_acquire_query_id(context); |
| context->logical_device->syms()->vkCmdWriteTimestamp( |
| command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, context->query_pool, |
| query_id); |
| |
| auto* item = tracy::Profiler::QueueSerial(); |
| tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuZoneBeginSerial); |
| tracy::MemWrite(&item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime()); |
| tracy::MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)src_loc); |
| tracy::MemWrite(&item->gpuZoneBegin.thread, tracy::GetThreadHandle()); |
| tracy::MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)query_id); |
| tracy::MemWrite(&item->gpuZoneBegin.context, context->id); |
| tracy::Profiler::QueueSerialFinish(); |
| } |
| |
| void iree_hal_vulkan_tracing_zone_begin_external_impl( |
| iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer, |
| const char* file_name, size_t file_name_length, uint32_t line, |
| const char* function_name, size_t function_name_length, const char* name, |
| size_t name_length) { |
| if (!context) return; |
| |
| uint32_t query_id = iree_hal_vulkan_tracing_context_acquire_query_id(context); |
| context->logical_device->syms()->vkCmdWriteTimestamp( |
| command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, context->query_pool, |
| query_id); |
| |
| const auto src_loc = tracy::Profiler::AllocSourceLocation( |
| line, file_name, file_name_length, function_name, function_name_length, |
| name, name_length); |
| auto* item = tracy::Profiler::QueueSerial(); |
| tracy::MemWrite(&item->hdr.type, |
| tracy::QueueType::GpuZoneBeginAllocSrcLocSerial); |
| tracy::MemWrite(&item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime()); |
| tracy::MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)src_loc); |
| tracy::MemWrite(&item->gpuZoneBegin.thread, tracy::GetThreadHandle()); |
| tracy::MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)query_id); |
| tracy::MemWrite(&item->gpuZoneBegin.context, context->id); |
| tracy::Profiler::QueueSerialFinish(); |
| } |
| |
| void iree_hal_vulkan_tracing_zone_end_impl( |
| iree_hal_vulkan_tracing_context_t* context, |
| VkCommandBuffer command_buffer) { |
| if (!context) return; |
| |
| uint32_t query_id = iree_hal_vulkan_tracing_context_acquire_query_id(context); |
| context->logical_device->syms()->vkCmdWriteTimestamp( |
| command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, context->query_pool, |
| query_id); |
| |
| auto* item = tracy::Profiler::QueueSerial(); |
| tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuZoneEndSerial); |
| tracy::MemWrite(&item->gpuZoneEnd.cpuTime, tracy::Profiler::GetTime()); |
| tracy::MemWrite(&item->gpuZoneEnd.thread, tracy::GetThreadHandle()); |
| tracy::MemWrite(&item->gpuZoneEnd.queryId, (uint16_t)query_id); |
| tracy::MemWrite(&item->gpuZoneEnd.context, context->id); |
| tracy::Profiler::QueueSerialFinish(); |
| } |
| |
| #endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION |