[cuda] Port over tracing utilities and use in NCCL channel (#14063)
The main change is removing the context wrapper and including CUDA
dynamic symbols directly.
Progress towards https://github.com/openxla/iree/issues/13245
diff --git a/experimental/cuda2/CMakeLists.txt b/experimental/cuda2/CMakeLists.txt
index 2992609..b9be87d 100644
--- a/experimental/cuda2/CMakeLists.txt
+++ b/experimental/cuda2/CMakeLists.txt
@@ -34,6 +34,8 @@
"nccl_channel.h"
"pipeline_layout.c"
"pipeline_layout.h"
+ "tracing.c"
+ "tracing.h"
DEPS
::dynamic_symbols
iree::base
diff --git a/experimental/cuda2/api.h b/experimental/cuda2/api.h
index 20951b0..5df17c5 100644
--- a/experimental/cuda2/api.h
+++ b/experimental/cuda2/api.h
@@ -59,6 +59,16 @@
// transient allocations while also increasing memory consumption.
iree_host_size_t arena_block_size;
+ // Enables tracing of command buffers when IREE tracing is enabled.
+ // May take advantage of additional extensions for more accurate timing or
+ // hardware-specific performance counters.
+ //
+ // NOTE: tracing has a non-trivial overhead and will skew the timing of
+ // submissions and introduce false barriers between dispatches. Use this to
+ // identify slow dispatches and refine from there; be wary of whole-program
+ // tracing with this enabled.
+ bool stream_tracing;
+
// Whether to use async allocations even if reported as available by the
// device. Defaults to true when the device supports it.
bool async_allocations;
diff --git a/experimental/cuda2/cuda_device.c b/experimental/cuda2/cuda_device.c
index 36bf8ab..e51d326 100644
--- a/experimental/cuda2/cuda_device.c
+++ b/experimental/cuda2/cuda_device.c
@@ -19,6 +19,7 @@
#include "experimental/cuda2/nccl_dynamic_symbols.h"
#include "experimental/cuda2/nop_executable_cache.h"
#include "experimental/cuda2/pipeline_layout.h"
+#include "experimental/cuda2/tracing.h"
#include "iree/base/internal/arena.h"
#include "iree/base/internal/math.h"
#include "iree/hal/utils/buffer_transfer.h"
@@ -53,6 +54,8 @@
// TODO: support multiple streams.
CUstream cu_stream;
+ iree_hal_cuda2_tracing_context_t* tracing_context;
+
iree_allocator_t host_allocator;
// Device memory pools and allocators.
@@ -82,6 +85,7 @@
memset(out_params, 0, sizeof(*out_params));
out_params->arena_block_size = 32 * 1024;
out_params->queue_count = 1;
+ out_params->stream_tracing = false;
out_params->async_allocations = true;
}
@@ -128,7 +132,13 @@
device->cu_stream = stream;
device->host_allocator = host_allocator;
+ // Enable tracing for the (currently only) stream - no-op if disabled.
iree_status_t status = iree_ok_status();
+ if (device->params.stream_tracing) {
+ status = iree_hal_cuda2_tracing_context_allocate(
+ device->cuda_symbols, device->identifier, stream, &device->block_pool,
+ host_allocator, &device->tracing_context);
+ }
// Memory pool support is conditional.
if (iree_status_is_ok(status) && params->async_allocations) {
@@ -237,6 +247,7 @@
iree_hal_cuda2_memory_pools_deinitialize(&device->memory_pools);
// TODO: support multiple streams.
+ iree_hal_cuda2_tracing_context_free(device->tracing_context);
IREE_CUDA_IGNORE_ERROR(device->cuda_symbols,
cuStreamDestroy(device->cu_stream));
diff --git a/experimental/cuda2/nccl_channel.c b/experimental/cuda2/nccl_channel.c
index e0bf3dc..55ffc52 100644
--- a/experimental/cuda2/nccl_channel.c
+++ b/experimental/cuda2/nccl_channel.c
@@ -544,11 +544,29 @@
iree_status_t iree_hal_cuda2_nccl_submit_batch(
const iree_hal_cuda2_nccl_dynamic_symbols_t* symbols,
+ iree_hal_cuda2_tracing_context_t* tracing_context,
const iree_hal_collective_batch_t* batch, CUstream stream) {
IREE_ASSERT_ARGUMENT(symbols);
IREE_ASSERT_ARGUMENT(batch);
IREE_ASSERT_ARGUMENT(stream);
+ // Begin one zone for each entry in the batch. Each entry will show stacked on
+ // top of each other and unfortunately use independent CUDA events. We could
+ // optimize this by changing the tracing context to expose an API with event
+ // reservation and then zone commit using an existing event.
+ IREE_TRACE({
+ iree_bitfield_string_temp_t string_temp;
+ for (iree_host_size_t i = 0; i < batch->count; ++i) {
+ iree_hal_collective_batch_entry_t* entry = &batch->entries[i];
+ iree_string_view_t collective_str =
+ iree_hal_collective_op_format(&entry->op, &string_temp);
+ IREE_CUDA_TRACE_ZONE_BEGIN_EXTERNAL(
+ tracing_context, stream, __FILE__, strlen(__FILE__),
+ (uint32_t)__LINE__, __FUNCTION__, strlen(__FUNCTION__),
+ collective_str.data, collective_str.size);
+ }
+ });
+
// Issue all collective operations in the batch as part of a group.
// NCCL may be able to fuse or reduce overheads by issuing like this.
IREE_NCCL_RETURN_IF_ERROR(symbols, ncclGroupStart(), "ncclGroupStart");
@@ -558,6 +576,14 @@
}
IREE_NCCL_RETURN_IF_ERROR(symbols, ncclGroupEnd(), "ncclGroupEnd");
+ // End all zones we began above - note that these are just simply nested so
+ // order doesn't matter so long as we end the right number of zones.
+ IREE_TRACE({
+ for (iree_host_size_t i = 0; i < batch->count; ++i) {
+ IREE_CUDA_TRACE_ZONE_END(tracing_context, stream);
+ }
+ });
+
return iree_ok_status();
}
diff --git a/experimental/cuda2/nccl_channel.h b/experimental/cuda2/nccl_channel.h
index a6da61f..ad8bfef 100644
--- a/experimental/cuda2/nccl_channel.h
+++ b/experimental/cuda2/nccl_channel.h
@@ -11,6 +11,7 @@
#include "experimental/cuda2/cuda_dynamic_symbols.h"
#include "experimental/cuda2/cuda_headers.h"
#include "experimental/cuda2/nccl_dynamic_symbols.h"
+#include "experimental/cuda2/tracing.h"
#include "iree/base/api.h"
#include "iree/hal/api.h"
#include "iree/hal/utils/collective_batch.h"
@@ -48,6 +49,7 @@
// Note that operations in the batch may apply to different channels.
iree_status_t iree_hal_cuda2_nccl_submit_batch(
const iree_hal_cuda2_nccl_dynamic_symbols_t* nccl_symbols,
+ iree_hal_cuda2_tracing_context_t* tracing_context,
const iree_hal_collective_batch_t* batch, CUstream stream);
#ifdef __cplusplus
diff --git a/experimental/cuda2/registration/driver_module.c b/experimental/cuda2/registration/driver_module.c
index 33d93a3..f7d46af 100644
--- a/experimental/cuda2/registration/driver_module.c
+++ b/experimental/cuda2/registration/driver_module.c
@@ -17,6 +17,12 @@
bool, cuda_async_allocations, true,
"Enables CUDA asynchronous stream-ordered allocations when supported.");
+IREE_FLAG(
+ bool, cuda_tracing, true,
+ "Enables tracing of stream events when Tracy instrumentation is enabled.\n"
+ "Severely impacts benchmark timings and should only be used when\n"
+ "analyzing dispatch timings.");
+
IREE_FLAG(int32_t, cuda2_default_index, 0,
"Specifies the index of the default CUDA device to use");
@@ -84,6 +90,7 @@
iree_hal_cuda2_device_params_t device_params;
iree_hal_cuda2_device_params_initialize(&device_params);
+ device_params.stream_tracing = FLAG_cuda_tracing;
device_params.async_allocations = FLAG_cuda_async_allocations;
driver_options.default_device_index = FLAG_cuda2_default_index;
diff --git a/experimental/cuda2/tracing.c b/experimental/cuda2/tracing.c
new file mode 100644
index 0000000..6243ad8
--- /dev/null
+++ b/experimental/cuda2/tracing.c
@@ -0,0 +1,293 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/cuda2/tracing.h"
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+#include "experimental/cuda2/cuda_dynamic_symbols.h"
+#include "experimental/cuda2/cuda_status_util.h"
+
+// Total number of events per tracing context. This translates to the maximum
+// number of outstanding timestamp queries before collection is required.
+// To prevent spilling pages we leave some room for the context structure.
+#define IREE_HAL_CUDA_TRACING_DEFAULT_QUERY_CAPACITY (16 * 1024 - 256)
+
+struct iree_hal_cuda2_tracing_context_t {
+ const iree_hal_cuda2_dynamic_symbols_t* symbols;
+
+ CUstream stream;
+ iree_arena_block_pool_t* block_pool;
+ iree_allocator_t host_allocator;
+
+ // A unique GPU zone ID allocated from Tracy.
+ // There is a global limit of 255 GPU zones (ID 255 is special).
+ uint8_t id;
+
+ // Base event used for computing relative times for all recorded events.
+ // This is required as CUDA (without CUPTI) only allows for relative timing
+ // between events and we need a stable base event.
+ CUevent base_event;
+
+ // Indices into |event_pool| defining a ringbuffer.
+ uint32_t query_head;
+ uint32_t query_tail;
+ uint32_t query_capacity;
+
+ // Event pool reused to capture tracing timestamps.
+ CUevent event_pool[IREE_HAL_CUDA_TRACING_DEFAULT_QUERY_CAPACITY];
+};
+
+static iree_status_t iree_hal_cuda2_tracing_context_initial_calibration(
+ const iree_hal_cuda2_dynamic_symbols_t* symbols, CUstream stream,
+ CUevent base_event, int64_t* out_cpu_timestamp, int64_t* out_gpu_timestamp,
+ float* out_timestamp_period) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+ *out_cpu_timestamp = 0;
+ *out_gpu_timestamp = 0;
+ *out_timestamp_period = 1.0f;
+
+ // Record event to the stream; in the absence of a synchronize this may not
+ // flush immediately.
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, IREE_CURESULT_TO_STATUS(symbols, cuEventRecord(base_event, stream)));
+
+ // Force flush the event and wait for it to complete.
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, IREE_CURESULT_TO_STATUS(symbols, cuEventSynchronize(base_event)));
+
+ // Track when we know the event has completed and has a reasonable timestamp.
+ // This may drift from the actual time differential between host/device but is
+ // (maybe?) the best we can do without CUPTI.
+ *out_cpu_timestamp = iree_tracing_time();
+
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+iree_status_t iree_hal_cuda2_tracing_context_allocate(
+    const iree_hal_cuda2_dynamic_symbols_t* symbols,
+    iree_string_view_t queue_name, CUstream stream,
+    iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
+    iree_hal_cuda2_tracing_context_t** out_context) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(symbols);
+  IREE_ASSERT_ARGUMENT(stream);
+  IREE_ASSERT_ARGUMENT(block_pool);
+  IREE_ASSERT_ARGUMENT(out_context);
+  *out_context = NULL;
+
+  iree_hal_cuda2_tracing_context_t* context = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, sizeof(*context), (void**)&context);
+  if (iree_status_is_ok(status)) {
+    context->symbols = symbols;
+    context->stream = stream;
+    context->block_pool = block_pool;
+    context->host_allocator = host_allocator;
+    context->query_capacity = IREE_ARRAYSIZE(context->event_pool);
+  }
+
+  // Pre-allocate all events in the event pool; stops at the first failure.
+  if (iree_status_is_ok(status)) {
+    IREE_TRACE_ZONE_BEGIN_NAMED(
+        z_event_pool, "iree_hal_cuda2_tracing_context_allocate_event_pool");
+    IREE_TRACE_ZONE_APPEND_VALUE(z_event_pool,
+                                 (int64_t)context->query_capacity);
+    for (iree_host_size_t i = 0; i < context->query_capacity; ++i) {
+      status = IREE_CURESULT_TO_STATUS(
+          symbols, cuEventCreate(&context->event_pool[i], CU_EVENT_DEFAULT));
+      if (!iree_status_is_ok(status)) break;
+    }
+    IREE_TRACE_ZONE_END(z_event_pool);
+  }
+
+  // Create the initial GPU event and insert it into the stream.
+  // All events we record are relative to this event.
+  int64_t cpu_timestamp = 0;
+  int64_t gpu_timestamp = 0;
+  float timestamp_period = 0.0f;
+  if (iree_status_is_ok(status)) {
+    status = IREE_CURESULT_TO_STATUS(
+        symbols, cuEventCreate(&context->base_event, CU_EVENT_DEFAULT));
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_cuda2_tracing_context_initial_calibration(
+        symbols, stream, context->base_event, &cpu_timestamp, &gpu_timestamp,
+        &timestamp_period);
+  }
+
+  // Allocate the GPU context and pass initial calibration data.
+  if (iree_status_is_ok(status)) {
+    context->id = iree_tracing_gpu_context_allocate(
+        IREE_TRACING_GPU_CONTEXT_TYPE_VULKAN, queue_name.data, queue_name.size,
+        /*is_calibrated=*/false, cpu_timestamp, gpu_timestamp,
+        timestamp_period);
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_context = context;
+  } else {
+    iree_hal_cuda2_tracing_context_free(context);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+void iree_hal_cuda2_tracing_context_free(
+ iree_hal_cuda2_tracing_context_t* context) {
+ if (!context) return;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Always perform a collection on shutdown.
+ iree_hal_cuda2_tracing_context_collect(context);
+
+ // Release all events; since collection completed they should all be unused.
+ IREE_TRACE_ZONE_BEGIN_NAMED(z_event_pool,
+ "iree_hal_cuda2_tracing_context_free_event_pool");
+ for (iree_host_size_t i = 0; i < context->query_capacity; ++i) {
+ if (context->event_pool[i]) {
+ IREE_CUDA_IGNORE_ERROR(context->symbols,
+ cuEventDestroy(context->event_pool[i]));
+ }
+ }
+ IREE_TRACE_ZONE_END(z_event_pool);
+ if (context->base_event) {
+ IREE_CUDA_IGNORE_ERROR(context->symbols,
+ cuEventDestroy(context->base_event));
+ }
+
+ iree_allocator_t host_allocator = context->host_allocator;
+ iree_allocator_free(host_allocator, context);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_hal_cuda2_tracing_context_collect(
+ iree_hal_cuda2_tracing_context_t* context) {
+ if (!context) return;
+ if (context->query_tail == context->query_head) {
+ // No outstanding queries.
+ return;
+ }
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ while (context->query_tail != context->query_head) {
+ // Compute the contiguous range of queries ready to be read.
+ // If the ringbuffer wraps around we'll handle that in the next loop.
+ uint32_t try_query_count =
+ context->query_head < context->query_tail
+ ? context->query_capacity - context->query_tail
+ : context->query_head - context->query_tail;
+ IREE_TRACE_ZONE_APPEND_VALUE(z0, (int64_t)try_query_count);
+
+ // Scan and feed the times to tracy, stopping when we hit the first
+ // unavailable query.
+ uint32_t query_base = context->query_tail;
+ uint32_t read_query_count = 0;
+ for (uint32_t i = 0; i < try_query_count; ++i) {
+ // Ensure the event has completed; will return CUDA_ERROR_NOT_READY if
+ // recorded but not retired or any other deferred error.
+ uint16_t query_id = (uint16_t)(query_base + i);
+ CUevent query_event = context->event_pool[query_id];
+ CUresult result = context->symbols->cuEventQuery(query_event);
+ if (result != CUDA_SUCCESS) break;
+
+ // Calculate context-relative time and notify tracy.
+ float relative_millis = 0.0f;
+ IREE_CUDA_IGNORE_ERROR(
+ context->symbols,
+ cuEventElapsedTime(&relative_millis, context->base_event,
+ query_event));
+ int64_t gpu_timestamp = (int64_t)((double)relative_millis * 1000000.0);
+ iree_tracing_gpu_zone_notify(context->id, query_id, gpu_timestamp);
+
+ read_query_count = i + 1;
+ }
+ IREE_TRACE_ZONE_APPEND_VALUE(z0, (int64_t)read_query_count);
+
+ context->query_tail += read_query_count;
+ if (context->query_tail >= context->query_capacity) {
+ context->query_tail = 0;
+ }
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+static uint16_t iree_hal_cuda2_tracing_context_insert_query(
+ iree_hal_cuda2_tracing_context_t* context, CUstream stream) {
+ // Allocate an event from the pool for use by the query.
+ uint32_t query_id = context->query_head;
+ context->query_head = (context->query_head + 1) % context->query_capacity;
+
+ // TODO: check to see if the read and write heads of the ringbuffer have
+ // overlapped. If they have we could try to collect but it's not guaranteed
+ // that collection will complete (e.g. we may be reserving events for use in
+ // graphs that haven't yet been launched).
+ //
+ // For now we just allow the overlap and tracing results will be inconsistent.
+ IREE_ASSERT_NE(context->query_head, context->query_tail);
+
+ CUevent event = context->event_pool[query_id];
+ IREE_CUDA_IGNORE_ERROR(context->symbols, cuEventRecord(event, stream));
+
+ return query_id;
+}
+
+// TODO: optimize this implementation to reduce the number of events required:
+// today we insert 2 events per zone (one for begin and one for end) but in
+// many cases we could reduce this by inserting events only between zones and
+// using the differences between them.
+
+void iree_hal_cuda2_tracing_zone_begin_impl(
+ iree_hal_cuda2_tracing_context_t* context, CUstream stream,
+ const iree_tracing_location_t* src_loc) {
+ if (!context) return;
+ uint16_t query_id =
+ iree_hal_cuda2_tracing_context_insert_query(context, stream);
+ iree_tracing_gpu_zone_begin(context->id, query_id, src_loc);
+}
+
+void iree_hal_cuda2_tracing_zone_begin_external_impl(
+ iree_hal_cuda2_tracing_context_t* context, CUstream stream,
+ const char* file_name, size_t file_name_length, uint32_t line,
+ const char* function_name, size_t function_name_length, const char* name,
+ size_t name_length) {
+ if (!context) return;
+ uint16_t query_id =
+ iree_hal_cuda2_tracing_context_insert_query(context, stream);
+ iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name,
+ file_name_length, line, function_name,
+ function_name_length, name, name_length);
+}
+
+void iree_hal_cuda2_tracing_zone_end_impl(
+ iree_hal_cuda2_tracing_context_t* context, CUstream stream) {
+ if (!context) return;
+ uint16_t query_id =
+ iree_hal_cuda2_tracing_context_insert_query(context, stream);
+ iree_tracing_gpu_zone_end(context->id, query_id);
+}
+
+#else
+
+iree_status_t iree_hal_cuda2_tracing_context_allocate(
+ const iree_hal_cuda2_dynamic_symbols_t* symbols,
+ iree_string_view_t queue_name, CUstream stream,
+ iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
+ iree_hal_cuda2_tracing_context_t** out_context) {
+ *out_context = NULL;
+ return iree_ok_status();
+}
+
+void iree_hal_cuda2_tracing_context_free(
+ iree_hal_cuda2_tracing_context_t* context) {}
+
+void iree_hal_cuda2_tracing_context_collect(
+ iree_hal_cuda2_tracing_context_t* context) {}
+
+#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
diff --git a/experimental/cuda2/tracing.h b/experimental/cuda2/tracing.h
new file mode 100644
index 0000000..57b6786
--- /dev/null
+++ b/experimental/cuda2/tracing.h
@@ -0,0 +1,122 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef EXPERIMENTAL_CUDA2_TRACING_H_
+#define EXPERIMENTAL_CUDA2_TRACING_H_
+
+#include "experimental/cuda2/cuda_dynamic_symbols.h"
+#include "experimental/cuda2/cuda_headers.h"
+#include "iree/base/api.h"
+#include "iree/base/internal/arena.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Per-stream CUDA tracing context.
+// No-op if IREE tracing is not enabled.
+//
+// Use the IREE_CUDA_TRACE_* macros to trace a contiguous set of stream
+// operations. Unlike the normal tracy macros there are no zone IDs and instead
+// each stream gets an ID allocated once and passed to all tracing macros.
+//
+// Usage:
+// IREE_CUDA_TRACE_ZONE_BEGIN(queue->tracing_context, stream);
+// cuLaunchKernel(..., stream);
+// IREE_CUDA_TRACE_ZONE_END(queue->tracing_context, stream);
+// ...
+// iree_hal_cuda2_tracing_context_collect(queue->tracing_context);
+//
+// NOTE: timestamps can have non-trivial side-effecting behavior and may
+// introduce serialization in graph execution.
+//
+// TODO(benvanik): expose CUevent reservation separate from recording. For
+// graphs we will need to insert the events but in order to reuse the graphs
+// we'll need to reserve and patch new events each graph launch. For now we
+// don't instrument graphs.
+//
+// Thread-compatible: external synchronization is required if using from
+// multiple threads (same as with CUstream itself).
+typedef struct iree_hal_cuda2_tracing_context_t
+ iree_hal_cuda2_tracing_context_t;
+
+// Allocates a tracing context for the given CUDA |stream|.
+// Each context must only be used with the stream it was created for.
+iree_status_t iree_hal_cuda2_tracing_context_allocate(
+ const iree_hal_cuda2_dynamic_symbols_t* symbols,
+ iree_string_view_t queue_name, CUstream stream,
+ iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
+ iree_hal_cuda2_tracing_context_t** out_context);
+
+// Frees a tracing context and all associated CUDA resources.
+// All submissions using the resources must be completed prior to calling.
+void iree_hal_cuda2_tracing_context_free(
+ iree_hal_cuda2_tracing_context_t* context);
+
+// Collects in-flight timestamp queries from the stream and feeds them to tracy.
+// Must be called frequently (every submission, etc) to drain the backlog;
+// tracing may start failing if the internal ringbuffer is exceeded.
+void iree_hal_cuda2_tracing_context_collect(
+ iree_hal_cuda2_tracing_context_t* context);
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+// Begins a normal zone derived on the calling |src_loc|.
+// Must be perfectly nested and paired with a corresponding zone end.
+void iree_hal_cuda2_tracing_zone_begin_impl(
+ iree_hal_cuda2_tracing_context_t* context, CUstream stream,
+ const iree_tracing_location_t* src_loc);
+
+// Begins an external zone using the given source information.
+// The provided strings will be copied into the tracy buffer.
+void iree_hal_cuda2_tracing_zone_begin_external_impl(
+ iree_hal_cuda2_tracing_context_t* context, CUstream stream,
+ const char* file_name, size_t file_name_length, uint32_t line,
+ const char* function_name, size_t function_name_length, const char* name,
+ size_t name_length);
+
+void iree_hal_cuda2_tracing_zone_end_impl(
+ iree_hal_cuda2_tracing_context_t* context, CUstream stream);
+
+// Begins a new unnamed zone attributed to the calling function/file/line.
+#define IREE_CUDA_TRACE_ZONE_BEGIN(context, stream)                      \
+  static const iree_tracing_location_t TracyConcat(                      \
+      __tracy_source_location, __LINE__) = {                             \
+      NULL, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0};              \
+  iree_hal_cuda2_tracing_zone_begin_impl(                                \
+      context, stream, &TracyConcat(__tracy_source_location, __LINE__));
+
+// Begins an externally defined zone with a dynamic source location.
+// The |file_name|, |function_name|, and optional |name| strings will be copied
+// into the trace buffer and do not need to persist.
+#define IREE_CUDA_TRACE_ZONE_BEGIN_EXTERNAL( \
+ context, stream, file_name, file_name_length, line, function_name, \
+ function_name_length, name, name_length) \
+ iree_hal_cuda2_tracing_zone_begin_external_impl( \
+ context, stream, file_name, file_name_length, line, function_name, \
+ function_name_length, name, name_length)
+
+// Ends the innermost open zone on |context|; zones carry no per-zone ID.
+#define IREE_CUDA_TRACE_ZONE_END(context, stream) \
+ iree_hal_cuda2_tracing_zone_end_impl(context, stream)
+
+#else
+
+#define IREE_CUDA_TRACE_ZONE_BEGIN(context, stream)
+#define IREE_CUDA_TRACE_ZONE_BEGIN_EXTERNAL( \
+ context, stream, file_name, file_name_length, line, function_name, \
+ function_name_length, name, name_length)
+#define IREE_CUDA_TRACE_ZONE_END(context, stream)
+
+#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // EXPERIMENTAL_CUDA2_TRACING_H_