runtime/src/iree/hal/drivers/vulkan/tracing.cc - 3p/openxla/iree - Git at Google

 // Copyright 2021 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 #include "iree/hal/drivers/vulkan/tracing.h"

 #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION

 #include "iree/base/api.h"
 #include "iree/base/target_platform.h"
 #include "third_party/tracy/Tracy.hpp"
 #include "third_party/tracy/client/TracyProfiler.hpp"
 #include "third_party/tracy/common/TracyAlloc.hpp"

 // Number of queries initially requested for the per-queue query pool. This
 // translates to the maximum number of outstanding queries before collection is
 // required. Pool creation halves the request each time the driver rejects it,
 // so the capacity actually in use may be smaller than this value.
 #define IREE_HAL_VULKAN_TRACING_DEFAULT_QUERY_CAPACITY (32 * 1024)

 // Total number of queries that can be read back from the API in a single
 // vkGetQueryPoolResults call during collection. This also sizes the readback
 // buffer embedded in every tracing context.
 #define IREE_HAL_VULKAN_TRACING_READBACK_QUERY_CAPACITY (8 * 1024)

 // Number of times we will query the max_deviation from calibrated timestamps
 // during the initial calibration. The more we do the better confidence we have
 // in a lower-bound.
 #define IREE_HAL_VULKAN_TRACING_MAX_DEVIATION_PROBE_COUNT 32

 // One readback entry as filled in by vkGetQueryPoolResults when it is called
 // with VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT: a
 // 64-bit timestamp followed by a 64-bit availability word. The size of this
 // struct is passed as the readback stride, so the field order and widths must
 // not change.
 typedef struct iree_hal_vulkan_timestamp_query_t {
   // Raw timestamp value as reported for the query.
   uint64_t timestamp;
   uint64_t availability;  // non-zero if available
 } iree_hal_vulkan_timestamp_query_t;

 // Per-queue tracing state: the Tracy GPU context identity, the clock
 // calibration bookkeeping, and a VkQueryPool used as a ringbuffer of timestamp
 // queries that are written by zone begin/end and drained by collection.
 struct iree_hal_vulkan_tracing_context_t {
   // Device and queue the context represents.
   iree::hal::vulkan::VkDeviceHandle* logical_device;
   VkQueue queue;
   // Allocator the context itself was allocated from; used again to free it.
   iree_allocator_t host_allocator;

   // Maintenance queue that supports dispatch commands and can be used to reset
   // queries.
   VkQueue maintenance_dispatch_queue;
   // Command pool that serves command buffers compatible with the
   // |maintenance_dispatch_queue|.
   iree::hal::vulkan::VkCommandPoolHandle* maintenance_command_pool;

   // A unique GPU zone ID allocated from Tracy.
   // There is a global limit of 255 GPU zones (ID 255 is special).
   uint8_t id;

   // Defines how the timestamps are interpreted (device-specific, posix, QPC).
   // VK_TIME_DOMAIN_DEVICE_EXT means no calibrated host domain is in use and
   // periodic calibration is skipped.
   // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkTimeDomainEXT.html
   VkTimeDomainEXT time_domain;

   // Maximum expected deviation between CPU and GPU timestamps, derived at
   // startup from the smallest deviation observed across a series of probes
   // (plus 50% slop). Calibration samples that exceed this value are
   // re-queried.
   uint64_t max_expected_deviation;

   // Vulkan-reported CPU timestamp of the last calibration.
   // Used to detect when drift occurs and we need to notify tracy.
   uint64_t previous_cpu_time;

   // Pool of query instances that we treat as a backing store for a ringbuffer.
   VkQueryPool query_pool;

   // Indices into |query_pool| defining a ringbuffer.
   // |query_head| is the next slot handed out when a query ID is acquired and
   // |query_tail| is the oldest slot that has not been collected yet; the
   // ringbuffer is empty when they are equal.
   uint32_t query_head;
   uint32_t query_tail;
   // Number of queries in |query_pool| as actually created.
   uint32_t query_capacity;

   // Readback storage; large enough to get a decent chunk of queries back from
   // the API in one shot.
   //
   // Data is stored as [[timestamp, availability], ...].
   // Availability will be non-zero if the timestamp is valid. Since we put all
   // timestamps in order once we reach an unavailable timestamp we can bail
   // and leave that for future collections.
   iree_hal_vulkan_timestamp_query_t
       readback_buffer[IREE_HAL_VULKAN_TRACING_READBACK_QUERY_CAPACITY];
 };

 // Allocates one primary command buffer from the context's maintenance command
 // pool and begins recording it for a single submission.
 // Allocation errors are ignored; VK_NULL_HANDLE is returned when no command
 // buffer could be obtained.
 static VkCommandBuffer iree_hal_vulkan_tracing_begin_command_buffer(
     iree_hal_vulkan_tracing_context_t* context) {
   // Describe the allocation: a single primary buffer from the maintenance pool.
   VkCommandBufferAllocateInfo allocate_info;
   memset(&allocate_info, 0, sizeof(allocate_info));
   allocate_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
   allocate_info.commandPool = *context->maintenance_command_pool;
   allocate_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
   allocate_info.commandBufferCount = 1;

   VkCommandBuffer new_command_buffer = VK_NULL_HANDLE;
   IREE_IGNORE_ERROR(context->maintenance_command_pool->Allocate(
       &allocate_info, &new_command_buffer));
   if (new_command_buffer == VK_NULL_HANDLE) {
     return VK_NULL_HANDLE;
   }

   // Start recording; the buffer is only ever submitted once.
   VkCommandBufferBeginInfo recording_info;
   memset(&recording_info, 0, sizeof(recording_info));
   recording_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
   recording_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
   context->logical_device->syms()->vkBeginCommandBuffer(new_command_buffer,
                                                         &recording_info);
   return new_command_buffer;
 }

 // Finishes recording |command_buffer|, submits it to the maintenance dispatch
 // queue, blocks until that queue is idle, and then returns the command buffer
 // to the maintenance command pool.
 static void iree_hal_vulkan_tracing_submit_command_buffer(
     iree_hal_vulkan_tracing_context_t* context,
     VkCommandBuffer command_buffer) {
   const auto& vk_syms = context->logical_device->syms();
   const VkQueue maintenance_queue = context->maintenance_dispatch_queue;

   vk_syms->vkEndCommandBuffer(command_buffer);

   // Single-buffer submission with no semaphores and no fence; completion is
   // observed by waiting for the whole queue to go idle.
   VkSubmitInfo submission;
   memset(&submission, 0, sizeof(submission));
   submission.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
   submission.pCommandBuffers = &command_buffer;
   submission.commandBufferCount = 1;
   vk_syms->vkQueueSubmit(maintenance_queue, 1, &submission, VK_NULL_HANDLE);
   vk_syms->vkQueueWaitIdle(maintenance_queue);

   context->maintenance_command_pool->Free(command_buffer);
 }

 // Synchronously resets |query_count| queries starting at |query_index| in the
 // context's query pool.
 // Uses the host-side reset entry point when the host_query_reset extension is
 // enabled and a function pointer is available; otherwise records the reset
 // into a one-shot command buffer and submits it to the maintenance queue.
 static void iree_hal_vulkan_tracing_reset_query_pool(
     iree_hal_vulkan_tracing_context_t* context, uint32_t query_index,
     uint32_t query_count) {
   const auto& vk_syms = context->logical_device->syms();

   // Resolve the host-side reset function, preferring the core entry point
   // (core in Vulkan 1.2) over the EXT one.
   PFN_vkResetQueryPool host_reset_fn = NULL;
   if (context->logical_device->enabled_extensions().host_query_reset) {
     host_reset_fn = vk_syms->vkResetQueryPool ? vk_syms->vkResetQueryPool
                                               : vk_syms->vkResetQueryPoolEXT;
   }
   if (host_reset_fn != NULL) {
     // Fast path: no queue submission required.
     host_reset_fn(*context->logical_device, context->query_pool, query_index,
                   query_count);
     return;
   }

   // Slow path: reset on the device via a throwaway command buffer. If no
   // command buffer can be allocated the reset is skipped.
   VkCommandBuffer reset_command_buffer =
       iree_hal_vulkan_tracing_begin_command_buffer(context);
   if (reset_command_buffer == VK_NULL_HANDLE) return;
   vk_syms->vkCmdResetQueryPool(reset_command_buffer, context->query_pool,
                                query_index, query_count);
   iree_hal_vulkan_tracing_submit_command_buffer(context, reset_command_buffer);
 }

 // Attempts to get a timestamp from both the CPU and GPU that are correlated
 // with each other. Only valid when calibration is supported.
 //
 // The driver is re-queried while the reported maximum deviation exceeds
 // |context->max_expected_deviation|. The retry loop is bounded and stops at
 // the first API failure so that a driver that never meets the threshold (or
 // starts failing) cannot hang the caller; in that case the lowest-deviation
 // sample obtained so far is used. If no sample could be obtained both
 // |out_cpu_time| and |out_gpu_time| are set to 0.
 //
 // On Windows the QPC host timestamp is scaled to nanoseconds using the QPC
 // frequency reported by tracy; other host domains are returned unmodified.
 static void iree_hal_vulkan_tracing_query_calibration_timestamps(
     iree_hal_vulkan_tracing_context_t* context, uint64_t* out_cpu_time,
     uint64_t* out_gpu_time) {
   IREE_TRACE_ZONE_BEGIN(z0);
   *out_cpu_time = 0;
   *out_gpu_time = 0;

   // Index 0 is the device domain and index 1 is the host domain.
   VkCalibratedTimestampInfoEXT timestamp_infos[2];
   timestamp_infos[0].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
   timestamp_infos[0].pNext = NULL;
   timestamp_infos[0].timeDomain = VK_TIME_DOMAIN_DEVICE_EXT;
   timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
   timestamp_infos[1].pNext = NULL;
   timestamp_infos[1].timeDomain = context->time_domain;

   // Upper bound on the number of samples taken while chasing a deviation
   // below the expected maximum.
   const uint32_t max_attempt_count = 64;
   uint64_t best_timestamps[2] = {0, 0};
   uint64_t best_deviation = 0;
   bool has_sample = false;
   for (uint32_t attempt = 0; attempt < max_attempt_count; ++attempt) {
     uint64_t timestamps[2] = {0, 0};
     uint64_t max_deviation = 0;
     VkResult result =
         context->logical_device->syms()->vkGetCalibratedTimestampsEXT(
             *context->logical_device, IREE_ARRAYSIZE(timestamps),
             timestamp_infos, timestamps, &max_deviation);
     // The outputs are not trustworthy on failure; keep what we already have.
     if (result != VK_SUCCESS) break;
     if (!has_sample || max_deviation < best_deviation) {
       best_timestamps[0] = timestamps[0];
       best_timestamps[1] = timestamps[1];
       best_deviation = max_deviation;
       has_sample = true;
     }
     // Good enough: this sample is necessarily the best one seen so far.
     if (max_deviation <= context->max_expected_deviation) break;
   }

   *out_gpu_time = best_timestamps[0];
   *out_cpu_time = best_timestamps[1];
   switch (context->time_domain) {
 #if defined(IREE_PLATFORM_WINDOWS)
     case VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT:
       *out_cpu_time *= (uint64_t)(1000000000.0 / tracy::GetFrequencyQpc());
       break;
 #else
     case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT:
     case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT:
       // TODO(benvanik): posix calibrated timestamps - ignored for now.
       break;
 #endif  // IREE_PLATFORM_WINDOWS
     default:
       break;
   }

   IREE_TRACE_ZONE_END(z0);
 }

 // Populates |out_cpu_time| and |out_gpu_time| with calibrated timestamps.
 // Depending on whether VK_EXT_calibrated_timestamps is available this may be
 // a guess done by ourselves (with lots of slop) or done by the driver (with
 // less slop).
 //
 // Device time domain: a timestamp is written to query 0 on the maintenance
 // queue and read back right after the host time is sampled. If no command
 // buffer can be allocated the GPU timestamp is left at 0 instead of waiting on
 // a query that will never become available.
 //
 // Calibrated time domains: the driver is probed several times to establish
 // |context->max_expected_deviation| before the first calibrated sample is
 // taken; |context->previous_cpu_time| is initialized from that sample.
 static void iree_hal_vulkan_tracing_perform_initial_calibration(
     iree_hal_vulkan_tracing_context_t* context, uint64_t* out_cpu_time,
     uint64_t* out_gpu_time) {
   const auto& syms = context->logical_device->syms();
   *out_cpu_time = 0;
   *out_gpu_time = 0;

   IREE_TRACE_ZONE_BEGIN(z0);
   // Label the zone with the time domain that is actually in use.
   const char* time_domain_name = "(unknown time domain)";
   switch (context->time_domain) {
     case VK_TIME_DOMAIN_DEVICE_EXT:
       time_domain_name = "VK_TIME_DOMAIN_DEVICE_EXT";
       break;
     case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT:
       time_domain_name = "VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT";
       break;
     case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT:
       time_domain_name = "VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT";
       break;
     case VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT:
       time_domain_name = "VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT";
       break;
     default:
       break;
   }
   IREE_TRACE_ZONE_APPEND_TEXT(z0, time_domain_name);

   // Attempt to get a timestamp from both the device and the host at roughly the
   // same time. There's a gap between when we get control returned to use after
   // submitting and waiting for idle and that will be the slop we have in the
   // timings in the tracy UI.
   if (context->time_domain == VK_TIME_DOMAIN_DEVICE_EXT) {
     // Submit a device timestamp.
     VkCommandBuffer command_buffer =
         iree_hal_vulkan_tracing_begin_command_buffer(context);
     const bool timestamp_submitted = command_buffer != VK_NULL_HANDLE;
     if (timestamp_submitted) {
       syms->vkCmdWriteTimestamp(command_buffer,
                                 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                                 context->query_pool, 0);
       iree_hal_vulkan_tracing_submit_command_buffer(context, command_buffer);
     }

     // Query the timestamp from the host and the device.
     *out_cpu_time = tracy::Profiler::GetTime();
     if (timestamp_submitted) {
       // Only wait on the query when a timestamp write was actually submitted;
       // waiting on a query that is never written may not return.
       syms->vkGetQueryPoolResults(
           *context->logical_device, context->query_pool, 0, 1,
           sizeof(*out_gpu_time), out_gpu_time, sizeof(*out_gpu_time),
           VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);

       // Reset the query used.
       iree_hal_vulkan_tracing_reset_query_pool(context, 0, 1);
     }
     IREE_TRACE_ZONE_END(z0);
     return;
   }

   // From the spec:
   // The maximum deviation may vary between calls to
   // vkGetCalibratedTimestampsEXT even for the same set of time domains due to
   // implementation and platform specific reasons. It is the application's
   // responsibility to assess whether the returned maximum deviation makes the
   // timestamp values suitable for any particular purpose and can choose to
   // re-issue the timestamp calibration call pursuing a lower deviation value.
   // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/vkGetCalibratedTimestampsEXT.html
   //
   // We perform a small number of queries here and find the minimum deviation
   // across all of them to get an average lower bound on the maximum deviation
   // from any particular query. We then use that as our baseline (plus some
   // slop) to see if calibration events in the future are reasonable.
   VkCalibratedTimestampInfoEXT timestamp_infos[2];
   timestamp_infos[0].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
   timestamp_infos[0].pNext = NULL;
   timestamp_infos[0].timeDomain = VK_TIME_DOMAIN_DEVICE_EXT;
   timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
   timestamp_infos[1].pNext = NULL;
   timestamp_infos[1].timeDomain = context->time_domain;
   uint64_t min_deviation = 0;
   bool has_deviation = false;
   IREE_TRACE_ZONE_BEGIN_NAMED(z1, "vkGetCalibratedTimestampsEXT");
   for (iree_host_size_t i = 0;
        i < IREE_HAL_VULKAN_TRACING_MAX_DEVIATION_PROBE_COUNT; ++i) {
     uint64_t timestamps[2] = {0, 0};
     uint64_t max_deviation = 0;
     VkResult result = syms->vkGetCalibratedTimestampsEXT(
         *context->logical_device, IREE_ARRAYSIZE(timestamps), timestamp_infos,
         timestamps, &max_deviation);
     // Failed probes carry no usable deviation; don't let them skew the bound.
     if (result != VK_SUCCESS) continue;
     if (!has_deviation || max_deviation < min_deviation) {
       min_deviation = max_deviation;
       has_deviation = true;
     }
   }
   IREE_TRACE_ZONE_END(z1);
   // With no successful probe there is no baseline to compare against, so
   // accept any deviation rather than demanding an unreachable one.
   context->max_expected_deviation =
       has_deviation ? min_deviation * 3 / 2 : ~(uint64_t)0;

   iree_hal_vulkan_tracing_query_calibration_timestamps(
       context, &context->previous_cpu_time, out_gpu_time);
   *out_cpu_time = tracy::Profiler::GetTime();

   IREE_TRACE_ZONE_END(z0);
 }

 // Performs a periodic calibration (if supported) and sends the data to tracy.
 // Host and device clocks may drift over time (especially with power events);
 // running this frequently keeps the samples sent to tracy correlatable.
 //
 // Does nothing when the context uses the device time domain. A calibration
 // event is only emitted when the calibrated host timestamp has advanced past
 // the one recorded by the previous calibration.
 void iree_hal_vulkan_tracing_perform_calibration(
     iree_hal_vulkan_tracing_context_t* context) {
   if (context->time_domain == VK_TIME_DOMAIN_DEVICE_EXT) return;
   IREE_TRACE_ZONE_BEGIN(z0);

   // Take a correlated host/device sample, then the profiler's own clock.
   uint64_t host_timestamp = 0;
   uint64_t device_timestamp = 0;
   iree_hal_vulkan_tracing_query_calibration_timestamps(
       context, &host_timestamp, &device_timestamp);
   uint64_t profiler_timestamp = tracy::Profiler::GetTime();

   const bool host_clock_advanced =
       host_timestamp > context->previous_cpu_time;
   if (host_clock_advanced) {
     uint64_t host_elapsed = host_timestamp - context->previous_cpu_time;
     context->previous_cpu_time = host_timestamp;

     auto* event = tracy::Profiler::QueueSerial();
     tracy::MemWrite(&event->hdr.type, tracy::QueueType::GpuCalibration);
     tracy::MemWrite(&event->gpuCalibration.context, context->id);
     tracy::MemWrite(&event->gpuCalibration.cpuDelta, host_elapsed);
     tracy::MemWrite(&event->gpuCalibration.cpuTime, profiler_timestamp);
     tracy::MemWrite(&event->gpuCalibration.gpuTime, device_timestamp);
     tracy::Profiler::QueueSerialFinish();
   }

   IREE_TRACE_ZONE_END(z0);
 }

 // Creates the timestamp VkQueryPool that backs the query ringbuffer and resets
 // all of its queries so they are ready for first use.
 // Starts at IREE_HAL_VULKAN_TRACING_DEFAULT_QUERY_CAPACITY and halves the
 // requested size every time creation fails; the size that finally succeeds is
 // stored in |context->query_capacity|.
 // NOTE(review): if vkCreateQueryPool never succeeds the requested size reaches
 // 0 and this loop does not terminate - confirm whether a failure path is
 // needed here.
 static void iree_hal_vulkan_tracing_prepare_query_pool(
     iree_hal_vulkan_tracing_context_t* context) {
   IREE_TRACE_ZONE_BEGIN(z0);
   auto* device = context->logical_device;

   VkQueryPoolCreateInfo create_info;
   memset(&create_info, 0, sizeof(create_info));
   create_info.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
   create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
   create_info.queryCount = IREE_HAL_VULKAN_TRACING_DEFAULT_QUERY_CAPACITY;

   // Try progressively smaller pools, recording each attempted size in the
   // trace zone.
   for (;;) {
     IREE_TRACE_ZONE_APPEND_VALUE(z0, create_info.queryCount);
     const VkResult create_result = device->syms()->vkCreateQueryPool(
         *device, &create_info, device->allocator(), &context->query_pool);
     if (create_result == VK_SUCCESS) break;
     create_info.queryCount /= 2;
   }
   context->query_capacity = create_info.queryCount;

   // All queries must be reset upon creation before first use.
   iree_hal_vulkan_tracing_reset_query_pool(context, 0, context->query_capacity);

   IREE_TRACE_ZONE_END(z0);
 }

 // Prepares the Tracy-related GPU context that events are fed into. Each context
 // will appear as a unique plot in the tracy UI with the given |queue_name|.
 //
 // Allocates the context ID, performs the initial CPU/GPU calibration, and then
 // announces the new context and its name to tracy. |physical_device| is only
 // used to look up the timestamp period.
 static void iree_hal_vulkan_tracing_prepare_gpu_context(
     iree_hal_vulkan_tracing_context_t* context,
     VkPhysicalDevice physical_device, iree_string_view_t queue_name) {
   IREE_TRACE_ZONE_BEGIN(z0);

   // Allocate the process-unique GPU context ID. There's a max of 255 available;
   // if we are recreating devices a lot we may exceed that. Don't do that, or
   // wrap around and get weird (but probably still usable) numbers.
   context->id =
       tracy::GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed);
   if (context->id >= 255) {
     context->id %= 255;
   }

   // The number of nanoseconds required for a timestamp query to be incremented
   // by 1.
   VkPhysicalDeviceProperties device_properties;
   context->logical_device->syms()->vkGetPhysicalDeviceProperties(
       physical_device, &device_properties);
   float timestamp_period = device_properties.limits.timestampPeriod;

   // Perform initial calibration for tracy to be able to correlate timestamps
   // between CPU and GPU.
   uint64_t cpu_time = 0;
   uint64_t gpu_time = 0;
   iree_hal_vulkan_tracing_perform_initial_calibration(context, &cpu_time,
                                                       &gpu_time);

   uint8_t context_flags = 0;
   if (context->time_domain != VK_TIME_DOMAIN_DEVICE_EXT) {
     // Tell tracy we'll be passing calibrated timestamps and not to mess with
     // the times. We'll periodically send GpuCalibration events in case the
     // times drift.
     context_flags |= tracy::GpuContextCalibration;
   }
   {
     auto* item = tracy::Profiler::QueueSerial();
     tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuNewContext);
     tracy::MemWrite(&item->gpuNewContext.cpuTime, cpu_time);
     tracy::MemWrite(&item->gpuNewContext.gpuTime, gpu_time);
     memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread));
     tracy::MemWrite(&item->gpuNewContext.period, timestamp_period);
     tracy::MemWrite(&item->gpuNewContext.context, context->id);
     tracy::MemWrite(&item->gpuNewContext.flags, context_flags);
     tracy::MemWrite(&item->gpuNewContext.type, tracy::GpuContextType::Vulkan);
     tracy::Profiler::QueueSerialFinish();
   }

   // Send the name of the context along.
   // NOTE: Tracy will unconditionally free the name so we must clone it here.
   // Since internally Tracy will use its own rpmalloc implementation we must
   // make sure we allocate from the same source.
   //
   // Tracy's fat context-name record stores the length in a 16-bit field (see
   // TracyQueue.hpp). MemWrite copies sizeof(T) bytes of whatever type it is
   // handed, so the length is clamped and written as a uint16_t rather than
   // as the wider string view size, which would write past the field.
   const uint16_t name_length =
       (uint16_t)iree_min(queue_name.size, (iree_host_size_t)0xFFFFu);
   char* cloned_name = (char*)tracy::tracy_malloc(name_length);
   memcpy(cloned_name, queue_name.data, name_length);
   {
     auto* item = tracy::Profiler::QueueSerial();
     tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuContextName);
     tracy::MemWrite(&item->gpuContextNameFat.context, context->id);
     tracy::MemWrite(&item->gpuContextNameFat.ptr, (uint64_t)cloned_name);
     tracy::MemWrite(&item->gpuContextNameFat.size, name_length);
     tracy::Profiler::QueueSerialFinish();
   }

   IREE_TRACE_ZONE_END(z0);
 }

 // Selects the time domain used to interpret timestamps for this device.
 // Returns VK_TIME_DOMAIN_DEVICE_EXT when the calibrated timestamps extension
 // is not enabled, when the calibrateable domains cannot be enumerated, or when
 // none of the reported domains is supported here. The device domain is only
 // usable for device-relative calculations and requires our own hacky
 // calibration. On Windows the QPC domain is returned if the device reports it;
 // the posix clock domains are currently ignored.
 static VkTimeDomainEXT iree_hal_vulkan_tracing_query_time_domain(
     VkPhysicalDevice physical_device,
     iree::hal::vulkan::VkDeviceHandle* logical_device) {
   // Without the extension only the device domain exists.
   if (!logical_device->enabled_extensions().calibrated_timestamps) {
     return VK_TIME_DOMAIN_DEVICE_EXT;
   }
   const auto& vk_syms = logical_device->syms();

   // First call fetches the count, second call fills the stack-allocated list.
   uint32_t time_domain_count = 0;
   VkResult enumerate_result =
       vk_syms->vkGetPhysicalDeviceCalibrateableTimeDomainsEXT(
           physical_device, &time_domain_count, NULL);
   if (enumerate_result != VK_SUCCESS) {
     return VK_TIME_DOMAIN_DEVICE_EXT;
   }
   VkTimeDomainEXT* time_domains = (VkTimeDomainEXT*)iree_alloca(
       time_domain_count * sizeof(VkTimeDomainEXT));
   enumerate_result = vk_syms->vkGetPhysicalDeviceCalibrateableTimeDomainsEXT(
       physical_device, &time_domain_count, time_domains);
   if (enumerate_result != VK_SUCCESS) {
     return VK_TIME_DOMAIN_DEVICE_EXT;
   }

   // Pick the first reported domain that this platform knows how to use.
   for (uint32_t domain_index = 0; domain_index < time_domain_count;
        ++domain_index) {
     const VkTimeDomainEXT candidate = time_domains[domain_index];
 #if defined(IREE_PLATFORM_WINDOWS)
     if (candidate == VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT) {
       return candidate;
     }
 #else
     // TODO(benvanik): support posix clock domains with some kind of math.
     // VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT and
     // VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT are ignored for now.
     (void)candidate;
 #endif  // IREE_PLATFORM_WINDOWS
   }
   return VK_TIME_DOMAIN_DEVICE_EXT;
 }

 // Allocates a tracing context for |queue| on |logical_device| and stores it in
 // |out_context| (which is set to NULL on entry).
 // The context is allocated from |host_allocator|, its time domain is chosen,
 // the query pool is created, and the Tracy GPU context named |queue_name| is
 // announced. The returned status reflects only the host allocation; the
 // preparation steps do not report errors.
 iree_status_t iree_hal_vulkan_tracing_context_allocate(
     VkPhysicalDevice physical_device,
     iree::hal::vulkan::VkDeviceHandle* logical_device, VkQueue queue,
     iree_string_view_t queue_name, VkQueue maintenance_dispatch_queue,
     iree::hal::vulkan::VkCommandPoolHandle* maintenance_command_pool,
     iree_allocator_t host_allocator,
     iree_hal_vulkan_tracing_context_t** out_context) {
   IREE_TRACE_ZONE_BEGIN(z0);
   IREE_ASSERT_ARGUMENT(logical_device);
   IREE_ASSERT_ARGUMENT(out_context);
   *out_context = NULL;

   iree_hal_vulkan_tracing_context_t* new_context = NULL;
   iree_status_t status = iree_allocator_malloc(
       host_allocator, sizeof(*new_context), (void**)&new_context);
   if (!iree_status_is_ok(status)) {
     // Nothing else has been set up yet; release whatever was handed back.
     iree_hal_vulkan_tracing_context_free(new_context);
     IREE_TRACE_ZONE_END(z0);
     return status;
   }

   // Record the handles the context operates on.
   new_context->logical_device = logical_device;
   new_context->queue = queue;
   new_context->host_allocator = host_allocator;
   new_context->time_domain = iree_hal_vulkan_tracing_query_time_domain(
       physical_device, logical_device);
   new_context->maintenance_dispatch_queue = maintenance_dispatch_queue;
   new_context->maintenance_command_pool = maintenance_command_pool;

   // The query pool must exist before the GPU context is prepared: the initial
   // calibration performed there may use it.
   iree_hal_vulkan_tracing_prepare_query_pool(new_context);
   iree_hal_vulkan_tracing_prepare_gpu_context(new_context, physical_device,
                                               queue_name);

   *out_context = new_context;
   IREE_TRACE_ZONE_END(z0);
   return status;
 }

 // Releases |context| and the query pool it owns. NULL is accepted and ignored.
 // When a query pool exists a final collection is performed before the pool is
 // destroyed.
 void iree_hal_vulkan_tracing_context_free(
     iree_hal_vulkan_tracing_context_t* context) {
   if (context == NULL) return;
   IREE_TRACE_ZONE_BEGIN(z0);

   const bool has_query_pool = context->query_pool != VK_NULL_HANDLE;
   if (has_query_pool) {
     // Always perform a collection on shutdown.
     iree_hal_vulkan_tracing_context_collect(context, VK_NULL_HANDLE);

     auto* device = context->logical_device;
     device->syms()->vkDestroyQueryPool(*device, context->query_pool,
                                        device->allocator());
   }

   // Copy the allocator out of the context before the context memory is freed.
   iree_allocator_t owning_allocator = context->host_allocator;
   iree_allocator_free(owning_allocator, context);

   IREE_TRACE_ZONE_END(z0);
 }

 // Hands out the query slot at the head of the ringbuffer and advances the head
 // by one, wrapping at |query_capacity|.
 // Asserts that the head has not caught up with the tail, i.e. that the
 // ringbuffer did not overflow.
 uint32_t iree_hal_vulkan_tracing_context_acquire_query_id(
     iree_hal_vulkan_tracing_context_t* context) {
   const uint32_t acquired_id = context->query_head;
   uint32_t next_head = acquired_id + 1;
   next_head %= context->query_capacity;
   context->query_head = next_head;
   assert(context->query_head != context->query_tail);
   return acquired_id;
 }

 // Reads back all timestamp queries that have completed, forwards them to tracy
 // in submission order, and resets the query slots that were consumed.
 //
 // |context| may be NULL (no-op). When |command_buffer| is not VK_NULL_HANDLE
 // the query resets are recorded into it; otherwise they are performed
 // synchronously. Collection stops at the first query that is not yet
 // available; it and everything after it are left for a later call. A
 // calibration is performed after every collection that had outstanding
 // queries.
 void iree_hal_vulkan_tracing_context_collect(
     iree_hal_vulkan_tracing_context_t* context,
     VkCommandBuffer command_buffer) {
   if (!context) return;
   if (context->query_tail == context->query_head) {
     // No outstanding queries.
     return;
   }
   IREE_TRACE_ZONE_BEGIN(z0);
   const auto& syms = context->logical_device->syms();

   while (context->query_tail != context->query_head) {
     // Compute the contiguous range of queries ready to be read.
     // If the ringbuffer wraps around we'll handle that in the next loop.
     uint32_t try_query_count =
         context->query_head < context->query_tail
             ? context->query_capacity - context->query_tail
             : context->query_head - context->query_tail;
     try_query_count = iree_min(try_query_count,
                                IREE_HAL_VULKAN_TRACING_READBACK_QUERY_CAPACITY);

     // Read back all of the queries. Note that we also are reading back the
     // availability such that we can handle partial readiness of the outstanding
     // range of queries.
     //
     // Without the WAIT/PARTIAL flags the call reports VK_NOT_READY as soon as
     // any query in the range is unavailable, but the availability words are
     // still written for every query. That is the partially-ready case we want
     // to consume, so only other results are treated as failures.
     uint32_t query_base = context->query_tail;
     VkResult readback_result = syms->vkGetQueryPoolResults(
         *context->logical_device, context->query_pool, query_base,
         try_query_count, sizeof(context->readback_buffer),
         context->readback_buffer, sizeof(iree_hal_vulkan_timestamp_query_t),
         VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT);
     if (readback_result != VK_SUCCESS && readback_result != VK_NOT_READY) {
       break;
     }

     // Scan and feed the times to tracy, stopping when we hit the first
     // unavailable query.
     uint32_t read_query_count = 0;
     for (uint32_t i = 0; i < try_query_count; ++i) {
       if (context->readback_buffer[i].availability == 0) break;
       read_query_count = i + 1;
       auto* item = tracy::Profiler::QueueSerial();
       tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuTime);
       tracy::MemWrite(&item->gpuTime.gpuTime,
                       context->readback_buffer[i].timestamp);
       tracy::MemWrite(&item->gpuTime.queryId, (uint16_t)(query_base + i));
       tracy::MemWrite(&item->gpuTime.context, context->id);
       tracy::Profiler::QueueSerialFinish();
     }

     // Nothing at the tail is ready yet: there is nothing to reset and looping
     // again would not make progress.
     if (read_query_count == 0) break;

     // Reset the range of queries read back.
     if (command_buffer != VK_NULL_HANDLE) {
       syms->vkCmdResetQueryPool(command_buffer, context->query_pool, query_base,
                                 read_query_count);
     } else {
       iree_hal_vulkan_tracing_reset_query_pool(context, query_base,
                                                read_query_count);
     }

     context->query_tail += read_query_count;
     if (context->query_tail >= context->query_capacity) {
       context->query_tail = 0;
     }

     // The next outstanding query is still pending; leave it for a future
     // collection instead of polling it in a loop.
     if (read_query_count < try_query_count) break;
   }

   // Run calibration - we could do this less frequently in cases where collect
   // is called every submission, however it's relatively cheap compared to all
   // this other tracing overhead.
   iree_hal_vulkan_tracing_perform_calibration(context);

   IREE_TRACE_ZONE_END(z0);
 }

 // Begins a GPU zone described by the static source location |src_loc|.
 // Acquires a query slot, records a timestamp write for it into
 // |command_buffer|, and queues the matching zone-begin event for tracy.
 // Does nothing when |context| is NULL.
 void iree_hal_vulkan_tracing_zone_begin_impl(
     iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer,
     const iree_tracing_location_t* src_loc) {
   if (context == NULL) return;

   // Reserve the next ringbuffer slot and have the device stamp it.
   const uint32_t timestamp_query_id =
       iree_hal_vulkan_tracing_context_acquire_query_id(context);
   const auto& vk_syms = context->logical_device->syms();
   vk_syms->vkCmdWriteTimestamp(command_buffer,
                                VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
                                context->query_pool, timestamp_query_id);

   // Tell tracy which query will carry the begin time of this zone.
   auto* event = tracy::Profiler::QueueSerial();
   tracy::MemWrite(&event->hdr.type, tracy::QueueType::GpuZoneBeginSerial);
   tracy::MemWrite(&event->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime());
   tracy::MemWrite(&event->gpuZoneBegin.thread, tracy::GetThreadHandle());
   tracy::MemWrite(&event->gpuZoneBegin.context, context->id);
   tracy::MemWrite(&event->gpuZoneBegin.queryId, (uint16_t)timestamp_query_id);
   tracy::MemWrite(&event->gpuZoneBegin.srcloc, (uint64_t)src_loc);
   tracy::Profiler::QueueSerialFinish();
 }

 // Begins a GPU zone whose source location is provided at runtime as
 // |file_name|, |line|, |function_name|, and |name| (each string with an
 // explicit length).
 // Acquires a query slot, records a timestamp write for it into
 // |command_buffer|, allocates a tracy source location from the given strings,
 // and queues the matching zone-begin event. Does nothing when |context| is
 // NULL.
 void iree_hal_vulkan_tracing_zone_begin_external_impl(
     iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer,
     const char* file_name, size_t file_name_length, uint32_t line,
     const char* function_name, size_t function_name_length, const char* name,
     size_t name_length) {
   if (context == NULL) return;

   // Reserve the next ringbuffer slot and have the device stamp it.
   const uint32_t timestamp_query_id =
       iree_hal_vulkan_tracing_context_acquire_query_id(context);
   const auto& vk_syms = context->logical_device->syms();
   vk_syms->vkCmdWriteTimestamp(command_buffer,
                                VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
                                context->query_pool, timestamp_query_id);

   // The source location is allocated by tracy before the event is queued.
   const auto allocated_src_loc = tracy::Profiler::AllocSourceLocation(
       line, file_name, file_name_length, function_name, function_name_length,
       name, name_length);

   auto* event = tracy::Profiler::QueueSerial();
   tracy::MemWrite(&event->hdr.type,
                   tracy::QueueType::GpuZoneBeginAllocSrcLocSerial);
   tracy::MemWrite(&event->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime());
   tracy::MemWrite(&event->gpuZoneBegin.thread, tracy::GetThreadHandle());
   tracy::MemWrite(&event->gpuZoneBegin.context, context->id);
   tracy::MemWrite(&event->gpuZoneBegin.queryId, (uint16_t)timestamp_query_id);
   tracy::MemWrite(&event->gpuZoneBegin.srcloc, (uint64_t)allocated_src_loc);
   tracy::Profiler::QueueSerialFinish();
 }

 // Ends the most recently begun GPU zone.
 // Acquires a query slot, records a timestamp write for it into
 // |command_buffer|, and queues the matching zone-end event for tracy.
 // Does nothing when |context| is NULL.
 void iree_hal_vulkan_tracing_zone_end_impl(
     iree_hal_vulkan_tracing_context_t* context,
     VkCommandBuffer command_buffer) {
   if (context == NULL) return;

   // Reserve the next ringbuffer slot and have the device stamp it.
   const uint32_t timestamp_query_id =
       iree_hal_vulkan_tracing_context_acquire_query_id(context);
   const auto& vk_syms = context->logical_device->syms();
   vk_syms->vkCmdWriteTimestamp(command_buffer,
                                VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
                                context->query_pool, timestamp_query_id);

   // Tell tracy which query will carry the end time of this zone.
   auto* event = tracy::Profiler::QueueSerial();
   tracy::MemWrite(&event->hdr.type, tracy::QueueType::GpuZoneEndSerial);
   tracy::MemWrite(&event->gpuZoneEnd.cpuTime, tracy::Profiler::GetTime());
   tracy::MemWrite(&event->gpuZoneEnd.thread, tracy::GetThreadHandle());
   tracy::MemWrite(&event->gpuZoneEnd.context, context->id);
   tracy::MemWrite(&event->gpuZoneEnd.queryId, (uint16_t)timestamp_query_id);
   tracy::Profiler::QueueSerialFinish();
 }

 #endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION