// Copyright 2023 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "experimental/rocm/tracing.h"
#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE
#include "experimental/rocm/status_util.h"
// Total number of events per tracing context. This translates to the maximum
// number of outstanding timestamp queries before collection is required.
// To prevent spilling pages we leave some room for the context structure.
#define IREE_HAL_ROCM_TRACING_DEFAULT_QUERY_CAPACITY (16 * 1024 - 256)
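// Tracing context recording timestamp queries against a single HIP stream.
// Events are preallocated into |event_pool| and handed out through a
// ringbuffer (|query_head|/|query_tail|) of outstanding queries that are
// resolved during collection.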
struct iree_hal_rocm_tracing_context_t {
iree_hal_rocm_context_wrapper_t* rocm_context;
hipStream_t stream;
iree_arena_block_pool_t* block_pool;
iree_allocator_t host_allocator;
// A unique GPU zone ID allocated from Tracy.
// There is a global limit of 255 GPU zones (ID 255 is special).
uint8_t id;
// Base event used for computing relative times for all recorded events.
// This is required as ROCM (without CUPTI) only allows for relative timing
// between events and we need a stable base event.
hipEvent_t base_event;
// Indices into |event_pool| defining a ringbuffer.
uint32_t query_head;
uint32_t query_tail;
uint32_t query_capacity;
// Event pool reused to capture tracing timestamps.
hipEvent_t event_pool[IREE_HAL_ROCM_TRACING_DEFAULT_QUERY_CAPACITY];
};
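// Performs the initial CPU/GPU calibration: records |base_event| on |stream|,
// blocks until it completes, and captures the CPU time at that point. All
// GPU timestamps reported to the tracing infrastructure are relative to this
// base event.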
static iree_status_t iree_hal_rocm_tracing_context_initial_calibration(
iree_hal_rocm_context_wrapper_t* rocm_context, hipStream_t stream,
hipEvent_t base_event, int64_t* out_cpu_timestamp,
int64_t* out_gpu_timestamp, float* out_timestamp_period) {
IREE_TRACE_ZONE_BEGIN(z0);
*out_cpu_timestamp = 0;
*out_gpu_timestamp = 0;
*out_timestamp_period = 1.0f;
// Record event to the stream; in the absence of a synchronize this may not
// flush immediately.
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, ROCM_RESULT_TO_STATUS(rocm_context->syms,
hipEventRecord(base_event, stream)));
// Force flush the event and wait for it to complete.
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, ROCM_RESULT_TO_STATUS(rocm_context->syms,
hipEventSynchronize(base_event)));
// Track when we know the event has completed and has a reasonable timestamp.
// This may drift from the actual time differential between host/device but is
// (maybe?) the best we can do without CUPTI.
*out_cpu_timestamp = iree_tracing_time();
IREE_TRACE_ZONE_END(z0);
return iree_ok_status();
}
iree_status_t iree_hal_rocm_tracing_context_allocate(
iree_hal_rocm_context_wrapper_t* rocm_context,
iree_string_view_t queue_name, hipStream_t stream,
iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
iree_hal_rocm_tracing_context_t** out_context) {
IREE_TRACE_ZONE_BEGIN(z0);
IREE_ASSERT_ARGUMENT(rocm_context);
IREE_ASSERT_ARGUMENT(stream);
IREE_ASSERT_ARGUMENT(block_pool);
IREE_ASSERT_ARGUMENT(out_context);
*out_context = NULL;
iree_hal_rocm_tracing_context_t* context = NULL;
iree_status_t status =
iree_allocator_malloc(host_allocator, sizeof(*context), (void**)&context);
if (iree_status_is_ok(status)) {
context->rocm_context = rocm_context;
context->stream = stream;
context->block_pool = block_pool;
context->host_allocator = host_allocator;
context->query_capacity = IREE_ARRAYSIZE(context->event_pool);
}
// Pre-allocate all events in the event pool.
if (iree_status_is_ok(status)) {
IREE_TRACE_ZONE_BEGIN_NAMED(
z_event_pool, "iree_hal_rocm_tracing_context_allocate_event_pool");
IREE_TRACE_ZONE_APPEND_VALUE_I64(z_event_pool,
(int64_t)context->query_capacity);
for (iree_host_size_t i = 0; i < context->query_capacity; ++i) {
status = ROCM_RESULT_TO_STATUS(rocm_context->syms,
hipEventCreate(&context->event_pool[i]));
if (!iree_status_is_ok(status)) break;
}
IREE_TRACE_ZONE_END(z_event_pool);
}
// Create the initial GPU event and insert it into the stream.
// All events we record are relative to this event.
int64_t cpu_timestamp = 0;
int64_t gpu_timestamp = 0;
float timestamp_period = 0.0f;
if (iree_status_is_ok(status)) {
status = ROCM_RESULT_TO_STATUS(rocm_context->syms,
hipEventCreate(&context->base_event));
}
if (iree_status_is_ok(status)) {
status = iree_hal_rocm_tracing_context_initial_calibration(
rocm_context, stream, context->base_event, &cpu_timestamp,
&gpu_timestamp, &timestamp_period);
}
// Allocate the GPU context and pass initial calibration data.
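// NOTE: the Vulkan GPU context type is reused here for the ROCm stream.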
if (iree_status_is_ok(status)) {
context->id = iree_tracing_gpu_context_allocate(
IREE_TRACING_GPU_CONTEXT_TYPE_VULKAN, queue_name.data, queue_name.size,
/*is_calibrated=*/false, cpu_timestamp, gpu_timestamp,
timestamp_period);
}
if (iree_status_is_ok(status)) {
*out_context = context;
} else {
iree_hal_rocm_tracing_context_free(context);
}
IREE_TRACE_ZONE_END(z0);
return status;
}
void iree_hal_rocm_tracing_context_free(
iree_hal_rocm_tracing_context_t* context) {
if (!context) return;
IREE_TRACE_ZONE_BEGIN(z0);
// Always perform a collection on shutdown.
iree_hal_rocm_tracing_context_collect(context);
// Release all events; since collection completed they should all be unused.
IREE_TRACE_ZONE_BEGIN_NAMED(z_event_pool,
"iree_hal_rocm_tracing_context_free_event_pool");
for (iree_host_size_t i = 0; i < context->query_capacity; ++i) {
if (context->event_pool[i]) {
ROCM_IGNORE_ERROR(context->rocm_context->syms,
hipEventDestroy(context->event_pool[i]));
}
}
IREE_TRACE_ZONE_END(z_event_pool);
if (context->base_event) {
ROCM_IGNORE_ERROR(context->rocm_context->syms,
hipEventDestroy(context->base_event));
}
iree_allocator_t host_allocator = context->host_allocator;
iree_allocator_free(host_allocator, context);
IREE_TRACE_ZONE_END(z0);
}
void iree_hal_rocm_tracing_context_collect(
iree_hal_rocm_tracing_context_t* context) {
if (!context) return;
if (context->query_tail == context->query_head) {
// No outstanding queries.
return;
}
IREE_TRACE_ZONE_BEGIN(z0);
iree_hal_rocm_dynamic_symbols_t* syms = context->rocm_context->syms;
while (context->query_tail != context->query_head) {
// Compute the contiguous range of queries ready to be read.
// If the ringbuffer wraps around we'll handle that in the next loop.
uint32_t try_query_count =
context->query_head < context->query_tail
? context->query_capacity - context->query_tail
: context->query_head - context->query_tail;
IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)try_query_count);
// Scan and feed the times to Tracy, stopping when we hit the first
// unavailable query.
uint32_t query_base = context->query_tail;
uint32_t read_query_count = 0;
for (uint32_t i = 0; i < try_query_count; ++i) {
// Check whether the event has completed; hipEventQuery returns
// hipErrorNotReady if the event has been recorded but not yet retired, or
// any other deferred error.
uint16_t query_id = (uint16_t)(query_base + i);
hipEvent_t query_event = context->event_pool[query_id];
hipError_t result = syms->hipEventQuery(query_event);
if (result != hipSuccess) break;
// Calculate the context-relative time and notify Tracy.
float relative_millis = 0.0f;
ROCM_IGNORE_ERROR(
syms, hipEventElapsedTime(&relative_millis, context->base_event,
query_event));
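// hipEventElapsedTime reports milliseconds; convert to nanoseconds.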
int64_t gpu_timestamp = (int64_t)((double)relative_millis * 1000000.0);
iree_tracing_gpu_zone_notify(context->id, query_id, gpu_timestamp);
read_query_count = i + 1;
}
IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)read_query_count);
context->query_tail += read_query_count;
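// Wrap the tail back to the start of the ringbuffer; any queries past the
// wrap point are picked up on the next loop iteration.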
if (context->query_tail >= context->query_capacity) {
context->query_tail = 0;
}
}
IREE_TRACE_ZONE_END(z0);
}
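// Reserves the next query slot from the ringbuffer and records its event on
// |stream|. The returned query ID is used to correlate the event with the
// zone begin/end notifications sent to Tracy.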
static uint16_t iree_hal_rocm_tracing_context_insert_query(
iree_hal_rocm_tracing_context_t* context, hipStream_t stream) {
// Allocate an event from the pool for use by the query.
uint32_t query_id = context->query_head;
context->query_head = (context->query_head + 1) % context->query_capacity;
// TODO: check to see if the read and write heads of the ringbuffer have
// overlapped. If they have we could try to collect but it's not guaranteed
// that collection will complete (e.g. we may be reserving events for use in
// graphs that haven't yet been launched).
//
// For now we just allow the overlap and tracing results will be inconsistent.
IREE_ASSERT_NE(context->query_head, context->query_tail);
hipEvent_t event = context->event_pool[query_id];
ROCM_IGNORE_ERROR(context->rocm_context->syms, hipEventRecord(event, stream));
return query_id;
}
// TODO: optimize this implementation to reduce the number of events required:
// today we insert 2 events per zone (one for begin and one for end) but in
// many cases we could reduce this by inserting events only between zones and
// using the differences between them.
void iree_hal_rocm_tracing_zone_begin_impl(
iree_hal_rocm_tracing_context_t* context, hipStream_t stream,
const iree_tracing_location_t* src_loc) {
if (!context) return;
uint16_t query_id =
iree_hal_rocm_tracing_context_insert_query(context, stream);
iree_tracing_gpu_zone_begin(context->id, query_id, src_loc);
}
void iree_hal_rocm_tracing_zone_begin_external_impl(
iree_hal_rocm_tracing_context_t* context, hipStream_t stream,
const char* file_name, size_t file_name_length, uint32_t line,
const char* function_name, size_t function_name_length, const char* name,
size_t name_length) {
if (!context) return;
uint16_t query_id =
iree_hal_rocm_tracing_context_insert_query(context, stream);
iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name,
file_name_length, line, function_name,
function_name_length, name, name_length);
}
void iree_hal_rocm_tracing_zone_end_impl(
iree_hal_rocm_tracing_context_t* context, hipStream_t stream) {
if (!context) return;
uint16_t query_id =
iree_hal_rocm_tracing_context_insert_query(context, stream);
iree_tracing_gpu_zone_end(context->id, query_id);
}
#else
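// Tracing is compiled out: provide no-op implementations so callers do not
// need to guard every call site with the feature flag.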
iree_status_t iree_hal_rocm_tracing_context_allocate(
iree_hal_rocm_context_wrapper_t* rocm_context,
iree_string_view_t queue_name, hipStream_t stream,
iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
iree_hal_rocm_tracing_context_t** out_context) {
*out_context = NULL;
return iree_ok_status();
}
void iree_hal_rocm_tracing_context_free(
iree_hal_rocm_tracing_context_t* context) {}
void iree_hal_rocm_tracing_context_collect(
iree_hal_rocm_tracing_context_t* context) {}
#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE