runtime/src/iree/hal/drivers/cuda/tracing.c - 3p/openxla/iree - Git at Google

 // Copyright 2023 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 #include "iree/hal/drivers/cuda/tracing.h"

 #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE

 #include "iree/hal/drivers/cuda/cuda_dynamic_symbols.h"
 #include "iree/hal/drivers/cuda/cuda_status_util.h"

 // Total number of events per tracing context. This translates to the maximum
 // number of outstanding timestamp queries before collection is required.
 // To prevent spilling pages we leave some room for the context structure.
 #define IREE_HAL_CUDA_TRACING_DEFAULT_QUERY_CAPACITY (16 * 1024 - 256)

 // iree_hal_cuda_tracing_context_event_t contains a cuEvent that is used to
 // record timestamps for tracing GPU execution. In this struct, there are also
 // two linked lists that the current event may be added to during its lifetime.
 //
 // --------------------->---Submissions--->----------
 // \                     \                    \
 //  \                     \                    \
 // command_buffer        command_buffer          command_buffer
 //
 // The submission list is owned by the tracing context and elements are
 // inserted and removed as commmand_buffers are submitted and when they
 // complete. This is a list of the head elements for each command buffer.
 // The commnad buffer list is owned by the command buffer. It is the list of
 // events used to trace command buffer dispatches.
 //
 // When the event is in the freelist, next_submission should be null, and
 // we reuse next_in_command_buffer to track the next free event.
 //
 // When the even is grabbed from the freelist to track GPU executions,
 // it is added to the list in recording command_buffer.
 struct iree_hal_cuda_tracing_context_event_t {
   CUevent event;
   iree_hal_cuda_tracing_context_event_t* next_in_command_buffer;
   iree_hal_cuda_tracing_context_event_t* next_submission;
   bool was_submitted;
 };

 struct iree_hal_cuda_tracing_context_t {
   const iree_hal_cuda_dynamic_symbols_t* symbols;
   iree_slim_mutex_t event_mutex;

   CUstream stream;
   iree_arena_block_pool_t* block_pool;
   iree_allocator_t host_allocator;

   // A unique GPU zone ID allocated from Tracy.
   // There is a global limit of 255 GPU zones (ID 255 is special).
   uint8_t id;

   // Base event used for computing relative times for all recorded events.
   // This is required as CUDA (without CUPTI) only allows for relative timing
   // between events and we need a stable base event.
   CUevent base_event;

   // Unallocated event list head. next_in_command_buffer points to the next
   // available event.
   iree_hal_cuda_tracing_context_event_t* event_freelist_head;

   // Submitted events.
   iree_hal_cuda_tracing_context_event_list_t submitted_event_list;

   uint32_t query_capacity;

   iree_hal_cuda_tracing_verbosity_t verbosity;

   // Event pool reused to capture tracing timestamps.
   // The lifetime of the events are as follows.
   // 1) All events are allocated when the tracing context is created.
   // 2) When a command_buffer inserts a query via:
   //    iree_hal_cuda_**_tracing_context_insert_query
   //    an event is pulled from the event freelist and added to the
   //    command buffer.
   // 3) When a command buffer is dispatched and
   //    iree_hal_cuda_tracing_notify_submitted is called, the events
   //    for that command buffer are added to the submitted_event_list.
   // 4) When the command buffer completes iree_hal_cuda_tracing_context_collect
   //    is called, and the events are removed from submitted_event_list as
   //    we collect their values.
   // 5) When the command buffer is destroyed, all events are put at the front
   //    of event_freelist.
   iree_hal_cuda_tracing_context_event_t
       event_pool[IREE_HAL_CUDA_TRACING_DEFAULT_QUERY_CAPACITY];
 };

 static iree_status_t iree_hal_cuda_tracing_context_initial_calibration(
     const iree_hal_cuda_dynamic_symbols_t* symbols, CUstream stream,
     CUevent base_event, int64_t* out_cpu_timestamp, int64_t* out_gpu_timestamp,
     float* out_timestamp_period) {
   IREE_TRACE_ZONE_BEGIN(z0);
   *out_cpu_timestamp = 0;
   *out_gpu_timestamp = 0;
   *out_timestamp_period = 1.0f;

   // Record event to the stream; in the absence of a synchronize this may not
   // flush immediately.
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, IREE_CURESULT_TO_STATUS(symbols, cuEventRecord(base_event, stream)));

   // Force flush the event and wait for it to complete.
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, IREE_CURESULT_TO_STATUS(symbols, cuEventSynchronize(base_event)));

   // Track when we know the event has completed and has a reasonable timestamp.
   // This may drift from the actual time differential between host/device but is
   // (maybe?) the best we can do without CUPTI.
   *out_cpu_timestamp = iree_tracing_time();

   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }

 iree_status_t iree_hal_cuda_tracing_context_allocate(
     const iree_hal_cuda_dynamic_symbols_t* symbols,
     iree_string_view_t queue_name, CUstream stream,
     iree_hal_cuda_tracing_verbosity_t stream_tracing_verbosity,
     iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
     iree_hal_cuda_tracing_context_t** out_context) {
   IREE_TRACE_ZONE_BEGIN(z0);
   IREE_ASSERT_ARGUMENT(symbols);
   IREE_ASSERT_ARGUMENT(stream);
   IREE_ASSERT_ARGUMENT(block_pool);
   IREE_ASSERT_ARGUMENT(out_context);
   *out_context = NULL;

   iree_hal_cuda_tracing_context_t* context = NULL;
   iree_status_t status =
       iree_allocator_malloc(host_allocator, sizeof(*context), (void**)&context);
   if (iree_status_is_ok(status)) {
     context->symbols = symbols;
     context->stream = stream;
     context->block_pool = block_pool;
     context->host_allocator = host_allocator;
     context->query_capacity = IREE_ARRAYSIZE(context->event_pool);
     context->submitted_event_list.head = NULL;
     context->submitted_event_list.tail = NULL;
     context->verbosity = stream_tracing_verbosity;
     iree_slim_mutex_initialize(&context->event_mutex);
   }

   // Pre-allocate all events in the event pool.
   if (iree_status_is_ok(status)) {
     IREE_TRACE_ZONE_BEGIN_NAMED(
         z_event_pool, "iree_hal_cuda_tracing_context_allocate_event_pool");
     IREE_TRACE_ZONE_APPEND_VALUE_I64(z_event_pool,
                                      (int64_t)context->query_capacity);
     context->event_freelist_head = &context->event_pool[0];
     for (iree_host_size_t i = 0; i < context->query_capacity; ++i) {
       status = IREE_CURESULT_TO_STATUS(
           symbols,
           cuEventCreate(&context->event_pool[i].event, CU_EVENT_DEFAULT));
       if (!iree_status_is_ok(status)) break;
       if (i > 0) {
         context->event_pool[i - 1].next_in_command_buffer =
             &context->event_pool[i];
       }
       context->event_pool[i].next_submission = NULL;
       context->event_pool[i].was_submitted = false;
       if (i + 1 == context->query_capacity) {
         context->event_pool[i].next_in_command_buffer = NULL;
       }
     }
     IREE_TRACE_ZONE_END(z_event_pool);
   }

   // Create the initial GPU event and insert it into the stream.
   // All events we record are relative to this event.
   int64_t cpu_timestamp = 0;
   int64_t gpu_timestamp = 0;
   float timestamp_period = 0.0f;
   if (iree_status_is_ok(status)) {
     status = IREE_CURESULT_TO_STATUS(
         symbols, cuEventCreate(&context->base_event, CU_EVENT_DEFAULT));
   }
   if (iree_status_is_ok(status)) {
     status = iree_hal_cuda_tracing_context_initial_calibration(
         symbols, stream, context->base_event, &cpu_timestamp, &gpu_timestamp,
         &timestamp_period);
   }

   // Allocate the GPU context and pass initial calibration data.
   if (iree_status_is_ok(status)) {
     context->id = iree_tracing_gpu_context_allocate(
         IREE_TRACING_GPU_CONTEXT_TYPE_VULKAN, queue_name.data, queue_name.size,
         /*is_calibrated=*/false, cpu_timestamp, gpu_timestamp,
         timestamp_period);
   }

   if (iree_status_is_ok(status)) {
     *out_context = context;
   } else {
     iree_hal_cuda_tracing_context_free(context);
   }
   IREE_TRACE_ZONE_END(z0);
   return status;
 }

 void iree_hal_cuda_tracing_context_free(
     iree_hal_cuda_tracing_context_t* context) {
   if (!context) return;
   IREE_TRACE_ZONE_BEGIN(z0);

   // Always perform a collection on shutdown.
   iree_hal_cuda_tracing_context_collect(context);

   // Release all events; since collection completed they should all be unused.
   IREE_TRACE_ZONE_BEGIN_NAMED(z_event_pool,
                               "iree_hal_cuda_tracing_context_free_event_pool");
   for (iree_host_size_t i = 0; i < context->query_capacity; ++i) {
     if (context->event_pool[i].event) {
       IREE_CUDA_IGNORE_ERROR(context->symbols,
                              cuEventDestroy(context->event_pool[i].event));
     }
   }
   IREE_TRACE_ZONE_END(z_event_pool);
   if (context->base_event) {
     IREE_CUDA_IGNORE_ERROR(context->symbols,
                            cuEventDestroy(context->base_event));
   }

   iree_slim_mutex_deinitialize(&context->event_mutex);

   iree_allocator_t host_allocator = context->host_allocator;
   iree_allocator_free(host_allocator, context);

   IREE_TRACE_ZONE_END(z0);
 }

 void iree_hal_cuda_tracing_context_collect(
     iree_hal_cuda_tracing_context_t* context) {
   if (!context) return;
   iree_slim_mutex_lock(&context->event_mutex);

   // No outstanding queries
   if (!context->submitted_event_list.head) {
     iree_slim_mutex_unlock(&context->event_mutex);
     return;
   }
   IREE_TRACE_ZONE_BEGIN(z0);
   // submitted_event_list is a list of the head elements for each command
   // buffer that has been submitted. Here we loop over all of the events,
   // wait for them to complete and gather the results with cuEventQuery.

   iree_hal_cuda_tracing_context_event_t* events =
       context->submitted_event_list.head;
   uint32_t read_query_count = 0;
   // Outer per-command_buffer loop.
   while (events) {
     iree_hal_cuda_tracing_context_event_t* event = events;
     // Inner per-event loop.
     while (event) {
       uint32_t query_id = (uint32_t)(event - &context->event_pool[0]);

       CUresult result = context->symbols->cuEventSynchronize(event->event);
       if (result != CUDA_SUCCESS) break;
       result = context->symbols->cuEventQuery(event->event);
       if (result != CUDA_SUCCESS) break;

       // Calculate context-relative time and notify tracy.
       float relative_millis = 0.0f;
       IREE_CUDA_IGNORE_ERROR(
           context->symbols,
           cuEventElapsedTime(&relative_millis, context->base_event,
                              event->event));
       int64_t gpu_timestamp = (int64_t)((double)relative_millis * 1000000.0);
       iree_tracing_gpu_zone_notify(context->id, query_id, gpu_timestamp);

       read_query_count += 1;
       event = event->next_in_command_buffer;
     }
     iree_hal_cuda_tracing_context_event_t* next = events->next_submission;
     events->was_submitted = true;
     events = next;
     context->submitted_event_list.head = events;
   }
   IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)read_query_count);
   IREE_TRACE_ZONE_END(z0);
   iree_slim_mutex_unlock(&context->event_mutex);
 }

 void iree_hal_cuda_tracing_notify_submitted(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list) {
   if (!context) return;
   IREE_ASSERT_ARGUMENT(event_list);
   iree_slim_mutex_lock(&context->event_mutex);

   if (!event_list->head) {
     iree_slim_mutex_unlock(&context->event_mutex);
     return;
   }

   if (!context->submitted_event_list.head) {
     context->submitted_event_list.head = event_list->head;
     context->submitted_event_list.tail = event_list->head;
   } else {
     context->submitted_event_list.tail->next_submission = event_list->head;
     context->submitted_event_list.tail = event_list->head;
   }

   iree_slim_mutex_unlock(&context->event_mutex);
 }

 void iree_hal_cuda_tracing_free(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list) {
   if (!context) return;
   iree_slim_mutex_lock(&context->event_mutex);

   IREE_ASSERT_ARGUMENT(event_list);

   if (!event_list->head) {
     iree_slim_mutex_unlock(&context->event_mutex);
     return;
   }

   // Free an event list that was previously created. There is some book-keeping
   // to keep tracy happy, and then we remove the elements from the
   // passed in event_list and add them to the front of the free-list.

   // If this event list has never been submitted we still need to add values to
   // the timeline otherwise tracy will not behave correctly.
   if (!event_list->head->was_submitted) {
     iree_hal_cuda_tracing_context_event_t* event = event_list->head;
     while (event) {
       uint32_t query_id = (uint32_t)(event - &context->event_pool[0]);
       iree_tracing_gpu_zone_notify(context->id, query_id, 0);
       event = event->next_in_command_buffer;
     }
   }

   if (!context->event_freelist_head) {
     context->event_freelist_head = event_list->head;
     iree_slim_mutex_unlock(&context->event_mutex);
     return;
   }
   event_list->head->next_submission = NULL;
   event_list->head->was_submitted = false;
   event_list->tail->next_in_command_buffer = context->event_freelist_head;
   context->event_freelist_head = event_list->head;

   event_list->head = NULL;
   event_list->tail = NULL;
   iree_slim_mutex_unlock(&context->event_mutex);
 }

 static void iree_hal_cuda_tracing_context_event_list_append_event(
     iree_hal_cuda_tracing_context_event_list_t* event_list,
     iree_hal_cuda_tracing_context_event_t* event) {
   if (!event_list->head) {
     event_list->head = event;
     event_list->tail = event;
   } else {
     event_list->tail->next_in_command_buffer = event;
     event_list->tail = event;
   }
 }

 // Grabs the next available query out of the freelist and adds it to
 // the event_list that was passed in. Also starts the recording of the
 // event.
 static uint16_t iree_hal_cuda_stream_tracing_context_insert_query(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream,
     iree_hal_cuda_tracing_verbosity_t verbosity) {
   iree_slim_mutex_lock(&context->event_mutex);
   IREE_ASSERT_ARGUMENT(event_list);

   // Allocate an event from the pool for use by the query.
   // TODO: If we have run out of our freelist, then we need to try and recover
   // allocate events.
   iree_hal_cuda_tracing_context_event_t* event = context->event_freelist_head;
   context->event_freelist_head = event->next_in_command_buffer;
   uint32_t query_id = event - &context->event_pool[0];
   IREE_ASSERT(event->next_in_command_buffer != NULL);
   event->next_in_command_buffer = NULL;

   IREE_CUDA_IGNORE_ERROR(context->symbols, cuEventRecord(event->event, stream));

   iree_hal_cuda_tracing_context_event_list_append_event(event_list, event);

   iree_slim_mutex_unlock(&context->event_mutex);
   return query_id;
 }

 // Grabs the next available query out of the freelist and adds it to
 // the event_list that was passed in. Also inserts the event record
 // node into the passed in graph. It returns the index of the
 // event.
 static uint16_t iree_hal_cuda_graph_tracing_context_insert_query(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list,
     CUgraphNode* out_node, CUgraph graph,
     iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes,
     size_t dependency_nodes_count) {
   IREE_ASSERT_ARGUMENT(event_list);
   iree_slim_mutex_lock(&context->event_mutex);

   // Allocate an event from the pool for use by the query.
   // TODO: If we have run out of our freelist, then we need to try and recover
   // or allocate more events.
   iree_hal_cuda_tracing_context_event_t* event = context->event_freelist_head;
   context->event_freelist_head = event->next_in_command_buffer;
   uint32_t query_id = event - &context->event_pool[0];
   IREE_ASSERT(event->next_in_command_buffer != NULL);
   event->next_in_command_buffer = NULL;

   iree_status_t status = IREE_CURESULT_TO_STATUS(
       context->symbols,
       cuGraphAddEventRecordNode(out_node, graph, dependency_nodes,
                                 dependency_nodes_count, event->event));
   IREE_ASSERT(iree_status_is_ok(status));

   iree_hal_cuda_tracing_context_event_list_append_event(event_list, event);

   iree_slim_mutex_unlock(&context->event_mutex);
   return query_id;
 }

 // TODO: optimize this implementation to reduce the number of events required:
 // today we insert 2 events per zone (one for begin and one for end) but in
 // many cases we could reduce this by inserting events only between zones and
 // using the differences between them.

 void iree_hal_cuda_stream_tracing_zone_begin_impl(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream,
     iree_hal_cuda_tracing_verbosity_t verbosity,
     const iree_tracing_location_t* src_loc) {
   if (!context) return;
   if (verbosity > context->verbosity) return;

   uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query(
       context, event_list, stream, verbosity);
   iree_tracing_gpu_zone_begin(context->id, query_id, src_loc);
 }

 void iree_hal_cuda_stream_tracing_zone_begin_external_impl(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream,
     iree_hal_cuda_tracing_verbosity_t verbosity, const char* file_name,
     size_t file_name_length, uint32_t line, const char* function_name,
     size_t function_name_length, const char* name, size_t name_length) {
   if (!context) return;
   if (verbosity > context->verbosity) return;
   uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query(
       context, event_list, stream, verbosity);
   iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name,
                                        file_name_length, line, function_name,
                                        function_name_length, name, name_length);
 }

 void iree_hal_cuda_graph_tracing_zone_begin_external_impl(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list,
     CUgraphNode* out_node, CUgraph graph,
     iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes,
     size_t dependency_nodes_count, const char* file_name,
     size_t file_name_length, uint32_t line, const char* function_name,
     size_t function_name_length, const char* name, size_t name_length) {
   if (!context) return;
   if (verbosity > context->verbosity) return;
   uint16_t query_id = iree_hal_cuda_graph_tracing_context_insert_query(
       context, event_list, out_node, graph, verbosity, dependency_nodes,
       dependency_nodes_count);
   iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name,
                                        file_name_length, line, function_name,
                                        function_name_length, name, name_length);
 }

 void iree_hal_cuda_stream_tracing_zone_end_impl(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream,
     iree_hal_cuda_tracing_verbosity_t verbosity) {
   if (!context) return;
   if (verbosity > context->verbosity) return;
   uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query(
       context, event_list, stream, verbosity);
   iree_tracing_gpu_zone_end(context->id, query_id);
 }

 void iree_hal_cuda_graph_tracing_zone_end_impl(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list,
     CUgraphNode* out_node, CUgraph graph,
     iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes,
     size_t dependency_nodes_count) {
   if (!context) return;
   if (verbosity > context->verbosity) return;
   uint16_t query_id = iree_hal_cuda_graph_tracing_context_insert_query(
       context, event_list, out_node, graph, verbosity, dependency_nodes,
       dependency_nodes_count);
   iree_tracing_gpu_zone_end(context->id, query_id);
 }

 #else

 iree_status_t iree_hal_cuda_tracing_context_allocate(
     const iree_hal_cuda_dynamic_symbols_t* symbols,
     iree_string_view_t queue_name, CUstream stream,
     iree_hal_cuda_tracing_verbosity_t stream_tracing_verbosity,
     iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
     iree_hal_cuda_tracing_context_t** out_context) {
   *out_context = NULL;
   return iree_ok_status();
 }

 void iree_hal_cuda_tracing_context_free(
     iree_hal_cuda_tracing_context_t* context) {}

 void iree_hal_cuda_tracing_context_collect(
     iree_hal_cuda_tracing_context_t* context) {}

 void iree_hal_cuda_tracing_notify_submitted(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list) {}

 void iree_hal_cuda_tracing_free(
     iree_hal_cuda_tracing_context_t* context,
     iree_hal_cuda_tracing_context_event_list_t* event_list) {}

 #endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
	// Copyright 2023 The IREE Authors
	//
	// Licensed under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

	#include "iree/hal/drivers/cuda/tracing.h"

	#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE

	#include "iree/hal/drivers/cuda/cuda_dynamic_symbols.h"
	#include "iree/hal/drivers/cuda/cuda_status_util.h"

	// Total number of events per tracing context. This translates to the maximum
	// number of outstanding timestamp queries before collection is required.
	// To prevent spilling pages we leave some room for the context structure.
	#define IREE_HAL_CUDA_TRACING_DEFAULT_QUERY_CAPACITY (16 * 1024 - 256)

	// iree_hal_cuda_tracing_context_event_t contains a cuEvent that is used to
	// record timestamps for tracing GPU execution. In this struct, there are also
	// two linked lists that the current event may be added to during its lifetime.
	//
	// --------------------->---Submissions--->----------
	// \ \ \
	// \ \ \
	// command_buffer command_buffer command_buffer
	//
	// The submission list is owned by the tracing context and elements are
	// inserted and removed as commmand_buffers are submitted and when they
	// complete. This is a list of the head elements for each command buffer.
	// The commnad buffer list is owned by the command buffer. It is the list of
	// events used to trace command buffer dispatches.
	//
	// When the event is in the freelist, next_submission should be null, and
	// we reuse next_in_command_buffer to track the next free event.
	//
	// When the even is grabbed from the freelist to track GPU executions,
	// it is added to the list in recording command_buffer.
	struct iree_hal_cuda_tracing_context_event_t {
	CUevent event;
	iree_hal_cuda_tracing_context_event_t* next_in_command_buffer;
	iree_hal_cuda_tracing_context_event_t* next_submission;
	bool was_submitted;
	};

	struct iree_hal_cuda_tracing_context_t {
	const iree_hal_cuda_dynamic_symbols_t* symbols;
	iree_slim_mutex_t event_mutex;

	CUstream stream;
	iree_arena_block_pool_t* block_pool;
	iree_allocator_t host_allocator;

	// A unique GPU zone ID allocated from Tracy.
	// There is a global limit of 255 GPU zones (ID 255 is special).
	uint8_t id;

	// Base event used for computing relative times for all recorded events.
	// This is required as CUDA (without CUPTI) only allows for relative timing
	// between events and we need a stable base event.
	CUevent base_event;

	// Unallocated event list head. next_in_command_buffer points to the next
	// available event.
	iree_hal_cuda_tracing_context_event_t* event_freelist_head;

	// Submitted events.
	iree_hal_cuda_tracing_context_event_list_t submitted_event_list;

	uint32_t query_capacity;

	iree_hal_cuda_tracing_verbosity_t verbosity;

	// Event pool reused to capture tracing timestamps.
	// The lifetime of the events are as follows.
	// 1) All events are allocated when the tracing context is created.
	// 2) When a command_buffer inserts a query via:
	// iree_hal_cuda_**_tracing_context_insert_query
	// an event is pulled from the event freelist and added to the
	// command buffer.
	// 3) When a command buffer is dispatched and
	// iree_hal_cuda_tracing_notify_submitted is called, the events
	// for that command buffer are added to the submitted_event_list.
	// 4) When the command buffer completes iree_hal_cuda_tracing_context_collect
	// is called, and the events are removed from submitted_event_list as
	// we collect their values.
	// 5) When the command buffer is destroyed, all events are put at the front
	// of event_freelist.
	iree_hal_cuda_tracing_context_event_t
	event_pool[IREE_HAL_CUDA_TRACING_DEFAULT_QUERY_CAPACITY];
	};

	static iree_status_t iree_hal_cuda_tracing_context_initial_calibration(
	const iree_hal_cuda_dynamic_symbols_t* symbols, CUstream stream,
	CUevent base_event, int64_t* out_cpu_timestamp, int64_t* out_gpu_timestamp,
	float* out_timestamp_period) {
	IREE_TRACE_ZONE_BEGIN(z0);
	*out_cpu_timestamp = 0;
	*out_gpu_timestamp = 0;
	*out_timestamp_period = 1.0f;

	// Record event to the stream; in the absence of a synchronize this may not
	// flush immediately.
	IREE_RETURN_AND_END_ZONE_IF_ERROR(
	z0, IREE_CURESULT_TO_STATUS(symbols, cuEventRecord(base_event, stream)));

	// Force flush the event and wait for it to complete.
	IREE_RETURN_AND_END_ZONE_IF_ERROR(
	z0, IREE_CURESULT_TO_STATUS(symbols, cuEventSynchronize(base_event)));

	// Track when we know the event has completed and has a reasonable timestamp.
	// This may drift from the actual time differential between host/device but is
	// (maybe?) the best we can do without CUPTI.
	*out_cpu_timestamp = iree_tracing_time();

	IREE_TRACE_ZONE_END(z0);
	return iree_ok_status();
	}

	iree_status_t iree_hal_cuda_tracing_context_allocate(
	const iree_hal_cuda_dynamic_symbols_t* symbols,
	iree_string_view_t queue_name, CUstream stream,
	iree_hal_cuda_tracing_verbosity_t stream_tracing_verbosity,
	iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
	iree_hal_cuda_tracing_context_t** out_context) {
	IREE_TRACE_ZONE_BEGIN(z0);
	IREE_ASSERT_ARGUMENT(symbols);
	IREE_ASSERT_ARGUMENT(stream);
	IREE_ASSERT_ARGUMENT(block_pool);
	IREE_ASSERT_ARGUMENT(out_context);
	*out_context = NULL;

	iree_hal_cuda_tracing_context_t* context = NULL;
	iree_status_t status =
	iree_allocator_malloc(host_allocator, sizeof(context), (void*)&context);
	if (iree_status_is_ok(status)) {
	context->symbols = symbols;
	context->stream = stream;
	context->block_pool = block_pool;
	context->host_allocator = host_allocator;
	context->query_capacity = IREE_ARRAYSIZE(context->event_pool);
	context->submitted_event_list.head = NULL;
	context->submitted_event_list.tail = NULL;
	context->verbosity = stream_tracing_verbosity;
	iree_slim_mutex_initialize(&context->event_mutex);
	}

	// Pre-allocate all events in the event pool.
	if (iree_status_is_ok(status)) {
	IREE_TRACE_ZONE_BEGIN_NAMED(
	z_event_pool, "iree_hal_cuda_tracing_context_allocate_event_pool");
	IREE_TRACE_ZONE_APPEND_VALUE_I64(z_event_pool,
	(int64_t)context->query_capacity);
	context->event_freelist_head = &context->event_pool[0];
	for (iree_host_size_t i = 0; i < context->query_capacity; ++i) {
	status = IREE_CURESULT_TO_STATUS(
	symbols,
	cuEventCreate(&context->event_pool[i].event, CU_EVENT_DEFAULT));
	if (!iree_status_is_ok(status)) break;
	if (i > 0) {
	context->event_pool[i - 1].next_in_command_buffer =
	&context->event_pool[i];
	}
	context->event_pool[i].next_submission = NULL;
	context->event_pool[i].was_submitted = false;
	if (i + 1 == context->query_capacity) {
	context->event_pool[i].next_in_command_buffer = NULL;
	}
	}
	IREE_TRACE_ZONE_END(z_event_pool);
	}

	// Create the initial GPU event and insert it into the stream.
	// All events we record are relative to this event.
	int64_t cpu_timestamp = 0;
	int64_t gpu_timestamp = 0;
	float timestamp_period = 0.0f;
	if (iree_status_is_ok(status)) {
	status = IREE_CURESULT_TO_STATUS(
	symbols, cuEventCreate(&context->base_event, CU_EVENT_DEFAULT));
	}
	if (iree_status_is_ok(status)) {
	status = iree_hal_cuda_tracing_context_initial_calibration(
	symbols, stream, context->base_event, &cpu_timestamp, &gpu_timestamp,
	&timestamp_period);
	}

	// Allocate the GPU context and pass initial calibration data.
	if (iree_status_is_ok(status)) {
	context->id = iree_tracing_gpu_context_allocate(
	IREE_TRACING_GPU_CONTEXT_TYPE_VULKAN, queue_name.data, queue_name.size,
	/is_calibrated=/false, cpu_timestamp, gpu_timestamp,
	timestamp_period);
	}

	if (iree_status_is_ok(status)) {
	*out_context = context;
	} else {
	iree_hal_cuda_tracing_context_free(context);
	}
	IREE_TRACE_ZONE_END(z0);
	return status;
	}

	void iree_hal_cuda_tracing_context_free(
	iree_hal_cuda_tracing_context_t* context) {
	if (!context) return;
	IREE_TRACE_ZONE_BEGIN(z0);

	// Always perform a collection on shutdown.
	iree_hal_cuda_tracing_context_collect(context);

	// Release all events; since collection completed they should all be unused.
	IREE_TRACE_ZONE_BEGIN_NAMED(z_event_pool,
	"iree_hal_cuda_tracing_context_free_event_pool");
	for (iree_host_size_t i = 0; i < context->query_capacity; ++i) {
	if (context->event_pool[i].event) {
	IREE_CUDA_IGNORE_ERROR(context->symbols,
	cuEventDestroy(context->event_pool[i].event));
	}
	}
	IREE_TRACE_ZONE_END(z_event_pool);
	if (context->base_event) {
	IREE_CUDA_IGNORE_ERROR(context->symbols,
	cuEventDestroy(context->base_event));
	}

	iree_slim_mutex_deinitialize(&context->event_mutex);

	iree_allocator_t host_allocator = context->host_allocator;
	iree_allocator_free(host_allocator, context);

	IREE_TRACE_ZONE_END(z0);
	}

	void iree_hal_cuda_tracing_context_collect(
	iree_hal_cuda_tracing_context_t* context) {
	if (!context) return;
	iree_slim_mutex_lock(&context->event_mutex);

	// No outstanding queries
	if (!context->submitted_event_list.head) {
	iree_slim_mutex_unlock(&context->event_mutex);
	return;
	}
	IREE_TRACE_ZONE_BEGIN(z0);
	// submitted_event_list is a list of the head elements for each command
	// buffer that has been submitted. Here we loop over all of the events,
	// wait for them to complete and gather the results with cuEventQuery.

	iree_hal_cuda_tracing_context_event_t* events =
	context->submitted_event_list.head;
	uint32_t read_query_count = 0;
	// Outer per-command_buffer loop.
	while (events) {
	iree_hal_cuda_tracing_context_event_t* event = events;
	// Inner per-event loop.
	while (event) {
	uint32_t query_id = (uint32_t)(event - &context->event_pool[0]);

	CUresult result = context->symbols->cuEventSynchronize(event->event);
	if (result != CUDA_SUCCESS) break;
	result = context->symbols->cuEventQuery(event->event);
	if (result != CUDA_SUCCESS) break;

	// Calculate context-relative time and notify tracy.
	float relative_millis = 0.0f;
	IREE_CUDA_IGNORE_ERROR(
	context->symbols,
	cuEventElapsedTime(&relative_millis, context->base_event,
	event->event));
	int64_t gpu_timestamp = (int64_t)((double)relative_millis * 1000000.0);
	iree_tracing_gpu_zone_notify(context->id, query_id, gpu_timestamp);

	read_query_count += 1;
	event = event->next_in_command_buffer;
	}
	iree_hal_cuda_tracing_context_event_t* next = events->next_submission;
	events->was_submitted = true;
	events = next;
	context->submitted_event_list.head = events;
	}
	IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)read_query_count);
	IREE_TRACE_ZONE_END(z0);
	iree_slim_mutex_unlock(&context->event_mutex);
	}

	void iree_hal_cuda_tracing_notify_submitted(
	iree_hal_cuda_tracing_context_t* context,
	iree_hal_cuda_tracing_context_event_list_t* event_list) {
	if (!context) return;
	IREE_ASSERT_ARGUMENT(event_list);
	iree_slim_mutex_lock(&context->event_mutex);

	if (!event_list->head) {
	iree_slim_mutex_unlock(&context->event_mutex);
	return;
	}

	if (!context->submitted_event_list.head) {
	context->submitted_event_list.head = event_list->head;
	context->submitted_event_list.tail = event_list->head;
	} else {
	context->submitted_event_list.tail->next_submission = event_list->head;
	context->submitted_event_list.tail = event_list->head;
	}

	iree_slim_mutex_unlock(&context->event_mutex);
	}

	void iree_hal_cuda_tracing_free(
	iree_hal_cuda_tracing_context_t* context,
	iree_hal_cuda_tracing_context_event_list_t* event_list) {
	if (!context) return;
	iree_slim_mutex_lock(&context->event_mutex);

	IREE_ASSERT_ARGUMENT(event_list);

	if (!event_list->head) {
	iree_slim_mutex_unlock(&context->event_mutex);
	return;
	}

	// Free an event list that was previously created. There is some book-keeping
	// to keep tracy happy, and then we remove the elements from the
	// passed in event_list and add them to the front of the free-list.

	// If this event list has never been submitted we still need to add values to
	// the timeline otherwise tracy will not behave correctly.
	if (!event_list->head->was_submitted) {
	iree_hal_cuda_tracing_context_event_t* event = event_list->head;
	while (event) {
	uint32_t query_id = (uint32_t)(event - &context->event_pool[0]);
	iree_tracing_gpu_zone_notify(context->id, query_id, 0);
	event = event->next_in_command_buffer;
	}
	}

	if (!context->event_freelist_head) {
	context->event_freelist_head = event_list->head;
	iree_slim_mutex_unlock(&context->event_mutex);
	return;
	}
	event_list->head->next_submission = NULL;
	event_list->head->was_submitted = false;
	event_list->tail->next_in_command_buffer = context->event_freelist_head;
	context->event_freelist_head = event_list->head;

	event_list->head = NULL;
	event_list->tail = NULL;
	iree_slim_mutex_unlock(&context->event_mutex);
	}

	static void iree_hal_cuda_tracing_context_event_list_append_event(
	iree_hal_cuda_tracing_context_event_list_t* event_list,
	iree_hal_cuda_tracing_context_event_t* event) {
	if (!event_list->head) {
	event_list->head = event;
	event_list->tail = event;
	} else {
	event_list->tail->next_in_command_buffer = event;
	event_list->tail = event;
	}
	}

	// Grabs the next available query out of the freelist and adds it to
	// the event_list that was passed in. Also starts the recording of the
	// event.
	static uint16_t iree_hal_cuda_stream_tracing_context_insert_query(
	iree_hal_cuda_tracing_context_t* context,
	iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream,
	iree_hal_cuda_tracing_verbosity_t verbosity) {
	iree_slim_mutex_lock(&context->event_mutex);
	IREE_ASSERT_ARGUMENT(event_list);

	// Allocate an event from the pool for use by the query.
	// TODO: If we have run out of our freelist, then we need to try and recover
	// allocate events.
	iree_hal_cuda_tracing_context_event_t* event = context->event_freelist_head;
	context->event_freelist_head = event->next_in_command_buffer;
	uint32_t query_id = event - &context->event_pool[0];
	IREE_ASSERT(event->next_in_command_buffer != NULL);
	event->next_in_command_buffer = NULL;

	IREE_CUDA_IGNORE_ERROR(context->symbols, cuEventRecord(event->event, stream));

	iree_hal_cuda_tracing_context_event_list_append_event(event_list, event);

	iree_slim_mutex_unlock(&context->event_mutex);
	return query_id;
	}

	// Grabs the next available query out of the freelist and adds it to
	// the event_list that was passed in. Also inserts the event record
	// node into the passed in graph. It returns the index of the
	// event.
	static uint16_t iree_hal_cuda_graph_tracing_context_insert_query(
	iree_hal_cuda_tracing_context_t* context,
	iree_hal_cuda_tracing_context_event_list_t* event_list,
	CUgraphNode* out_node, CUgraph graph,
	iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes,
	size_t dependency_nodes_count) {
	IREE_ASSERT_ARGUMENT(event_list);
	iree_slim_mutex_lock(&context->event_mutex);

	// Allocate an event from the pool for use by the query.
	// TODO: If we have run out of our freelist, then we need to try and recover
	// or allocate more events.
	iree_hal_cuda_tracing_context_event_t* event = context->event_freelist_head;
	context->event_freelist_head = event->next_in_command_buffer;
	uint32_t query_id = event - &context->event_pool[0];
	IREE_ASSERT(event->next_in_command_buffer != NULL);
	event->next_in_command_buffer = NULL;

	iree_status_t status = IREE_CURESULT_TO_STATUS(
	context->symbols,
	cuGraphAddEventRecordNode(out_node, graph, dependency_nodes,
	dependency_nodes_count, event->event));
	IREE_ASSERT(iree_status_is_ok(status));

	iree_hal_cuda_tracing_context_event_list_append_event(event_list, event);

	iree_slim_mutex_unlock(&context->event_mutex);
	return query_id;
	}

	// TODO: optimize this implementation to reduce the number of events required:
	// today we insert 2 events per zone (one for begin and one for end) but in
	// many cases we could reduce this by inserting events only between zones and
	// using the differences between them.

	void iree_hal_cuda_stream_tracing_zone_begin_impl(
	iree_hal_cuda_tracing_context_t* context,
	iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream,
	iree_hal_cuda_tracing_verbosity_t verbosity,
	const iree_tracing_location_t* src_loc) {
	if (!context) return;
	if (verbosity > context->verbosity) return;

	uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query(
	context, event_list, stream, verbosity);
	iree_tracing_gpu_zone_begin(context->id, query_id, src_loc);
	}

	void iree_hal_cuda_stream_tracing_zone_begin_external_impl(
	iree_hal_cuda_tracing_context_t* context,
	iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream,
	iree_hal_cuda_tracing_verbosity_t verbosity, const char* file_name,
	size_t file_name_length, uint32_t line, const char* function_name,
	size_t function_name_length, const char* name, size_t name_length) {
	if (!context) return;
	if (verbosity > context->verbosity) return;
	uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query(
	context, event_list, stream, verbosity);
	iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name,
	file_name_length, line, function_name,
	function_name_length, name, name_length);
	}

	void iree_hal_cuda_graph_tracing_zone_begin_external_impl(
	iree_hal_cuda_tracing_context_t* context,
	iree_hal_cuda_tracing_context_event_list_t* event_list,
	CUgraphNode* out_node, CUgraph graph,
	iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes,
	size_t dependency_nodes_count, const char* file_name,
	size_t file_name_length, uint32_t line, const char* function_name,
	size_t function_name_length, const char* name, size_t name_length) {
	if (!context) return;
	if (verbosity > context->verbosity) return;
	uint16_t query_id = iree_hal_cuda_graph_tracing_context_insert_query(
	context, event_list, out_node, graph, verbosity, dependency_nodes,
	dependency_nodes_count);
	iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name,
	file_name_length, line, function_name,
	function_name_length, name, name_length);
	}

	void iree_hal_cuda_stream_tracing_zone_end_impl(
	iree_hal_cuda_tracing_context_t* context,
	iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream,
	iree_hal_cuda_tracing_verbosity_t verbosity) {
	if (!context) return;
	if (verbosity > context->verbosity) return;
	uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query(
	context, event_list, stream, verbosity);
	iree_tracing_gpu_zone_end(context->id, query_id);
	}

	void iree_hal_cuda_graph_tracing_zone_end_impl(
	iree_hal_cuda_tracing_context_t* context,
	iree_hal_cuda_tracing_context_event_list_t* event_list,
	CUgraphNode* out_node, CUgraph graph,
	iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes,
	size_t dependency_nodes_count) {
	if (!context) return;
	if (verbosity > context->verbosity) return;
	uint16_t query_id = iree_hal_cuda_graph_tracing_context_insert_query(
	context, event_list, out_node, graph, verbosity, dependency_nodes,
	dependency_nodes_count);
	iree_tracing_gpu_zone_end(context->id, query_id);
	}

	#else

	iree_status_t iree_hal_cuda_tracing_context_allocate(
	const iree_hal_cuda_dynamic_symbols_t* symbols,
	iree_string_view_t queue_name, CUstream stream,
	iree_hal_cuda_tracing_verbosity_t stream_tracing_verbosity,
	iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
	iree_hal_cuda_tracing_context_t** out_context) {
	*out_context = NULL;
	return iree_ok_status();
	}

	void iree_hal_cuda_tracing_context_free(
	iree_hal_cuda_tracing_context_t* context) {}

	void iree_hal_cuda_tracing_context_collect(
	iree_hal_cuda_tracing_context_t* context) {}

	void iree_hal_cuda_tracing_notify_submitted(
	iree_hal_cuda_tracing_context_t* context,
	iree_hal_cuda_tracing_context_event_list_t* event_list) {}

	void iree_hal_cuda_tracing_free(
	iree_hal_cuda_tracing_context_t* context,
	iree_hal_cuda_tracing_context_event_list_t* event_list) {}

	#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE