// Copyright 2023 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "experimental/cuda2/event_pool.h"

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

#include "experimental/cuda2/cuda_dynamic_symbols.h"
#include "experimental/cuda2/cuda_status_util.h"
#include "iree/base/api.h"
#include "iree/base/internal/atomics.h"
#include "iree/base/internal/synchronization.h"
#include "iree/hal/api.h"

//===----------------------------------------------------------------------===//
// iree_hal_cuda2_event_t
//===----------------------------------------------------------------------===//

struct iree_hal_cuda2_event_t {
// The allocator used to create the event.
iree_allocator_t host_allocator;
// The symbols used to create and destroy CUevent objects.
const iree_hal_cuda2_dynamic_symbols_t* symbols;
// The event pool that owns this event. This cannot be NULL.
iree_hal_cuda2_event_pool_t* pool;
// The underlying CUevent object.
CUevent cu_event;
  // A reference count used to manage resource lifetime. Its value ranges:
  // * 1 - when inside the event pool, waiting to be acquired;
  // * >= 1 - when acquired and live outside of the event pool;
  // * 0 - right before being released back to the pool or destroyed.
iree_atomic_ref_count_t ref_count;
};
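
// Illustrative lifecycle of a pooled event (a sketch of the refcount
// protocol above; |pool| and |event| are hypothetical locals):
//   iree_hal_cuda2_event_pool_acquire(pool, 1, &event);  // refcount == 1
//   iree_hal_cuda2_event_retain(event);   // refcount == 2, e.g. queued work
//   iree_hal_cuda2_event_release(event);  // refcount == 1
//   iree_hal_cuda2_event_release(event);  // refcount == 0 -> back to the pool
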
CUevent iree_hal_cuda2_event_handle(const iree_hal_cuda2_event_t* event) {
return event->cu_event;
}
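
// The handle can be passed straight to the driver API via the dynamic
// symbols, e.g. (a sketch; |symbols| and |stream| are assumed in scope):
//   IREE_CUDA_IGNORE_ERROR(
//       symbols, cuEventRecord(iree_hal_cuda2_event_handle(event), stream));
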
static inline void iree_hal_cuda2_event_destroy(
    iree_hal_cuda2_event_t* event) {
iree_allocator_t host_allocator = event->host_allocator;
const iree_hal_cuda2_dynamic_symbols_t* symbols = event->symbols;
IREE_TRACE_ZONE_BEGIN(z0);
IREE_ASSERT_REF_COUNT_ZERO(&event->ref_count);
IREE_CUDA_IGNORE_ERROR(symbols, cuEventDestroy(event->cu_event));
iree_allocator_free(host_allocator, event);
IREE_TRACE_ZONE_END(z0);
}

static inline iree_status_t iree_hal_cuda2_event_create(
const iree_hal_cuda2_dynamic_symbols_t* symbols,
iree_hal_cuda2_event_pool_t* pool, iree_allocator_t host_allocator,
iree_hal_cuda2_event_t** out_event) {
IREE_ASSERT_ARGUMENT(symbols);
IREE_ASSERT_ARGUMENT(pool);
IREE_ASSERT_ARGUMENT(out_event);
*out_event = NULL;
IREE_TRACE_ZONE_BEGIN(z0);
iree_hal_cuda2_event_t* event = NULL;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0,
iree_allocator_malloc(host_allocator, sizeof(*event), (void**)&event));
event->host_allocator = host_allocator;
event->symbols = symbols;
event->pool = pool;
event->cu_event = NULL;
iree_atomic_ref_count_init(&event->ref_count); // -> 1
iree_status_t status = IREE_CURESULT_TO_STATUS(
symbols, cuEventCreate(&event->cu_event, CU_EVENT_DISABLE_TIMING),
"cuEventCreate");
if (iree_status_is_ok(status)) {
*out_event = event;
} else {
iree_atomic_ref_count_dec(&event->ref_count); // -> 0
iree_hal_cuda2_event_destroy(event);
}
IREE_TRACE_ZONE_END(z0);
return status;
}

void iree_hal_cuda2_event_retain(iree_hal_cuda2_event_t* event) {
iree_atomic_ref_count_inc(&event->ref_count);
}

static void iree_hal_cuda2_event_pool_release(
iree_hal_cuda2_event_pool_t* event_pool, iree_host_size_t event_count,
    iree_hal_cuda2_event_t** events);

void iree_hal_cuda2_event_release(iree_hal_cuda2_event_t* event) {
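  // iree_atomic_ref_count_dec returns the value prior to the decrement, so
  // seeing 1 here means this release just dropped the count to 0.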
if (iree_atomic_ref_count_dec(&event->ref_count) == 1) {
// Release back to the pool if the reference count becomes 0.
iree_hal_cuda2_event_pool_release(event->pool, 1, &event);
}
}

//===----------------------------------------------------------------------===//
// iree_hal_cuda2_event_pool_t
//===----------------------------------------------------------------------===//

struct iree_hal_cuda2_event_pool_t {
// The allocator used to create the event pool.
iree_allocator_t host_allocator;
// The symbols used to create and destroy CUevent objects.
const iree_hal_cuda2_dynamic_symbols_t* symbols;
  // Guards event-related fields in the pool. We don't expect a performant
  // program to frequently allocate events for synchronization purposes, so
  // traffic to this pool should be low; a mutex is good enough here.
iree_slim_mutex_t event_mutex;
  // Maximum number of event objects that will be maintained in the pool.
  // More events may be allocated at any time, but any beyond this capacity
  // are destroyed directly when released back to the pool.
iree_host_size_t available_capacity IREE_GUARDED_BY(event_mutex);
// Total number of currently available event objects.
iree_host_size_t available_count IREE_GUARDED_BY(event_mutex);
// The list of available_count event objects.
iree_hal_cuda2_event_t* available_list[] IREE_GUARDED_BY(event_mutex);
};
// + Additional inline allocation for holding events up to the capacity.
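// For a capacity of N the whole pool is thus a single allocation of
//   sizeof(iree_hal_cuda2_event_pool_t) + N * sizeof(iree_hal_cuda2_event_t*)
// bytes, matching the total_size computation in the allocation logic below.
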
iree_status_t iree_hal_cuda2_event_pool_allocate(
const iree_hal_cuda2_dynamic_symbols_t* symbols,
iree_host_size_t available_capacity, iree_allocator_t host_allocator,
iree_hal_cuda2_event_pool_t** out_event_pool) {
IREE_ASSERT_ARGUMENT(symbols);
IREE_ASSERT_ARGUMENT(out_event_pool);
*out_event_pool = NULL;
IREE_TRACE_ZONE_BEGIN(z0);
iree_hal_cuda2_event_pool_t* event_pool = NULL;
iree_host_size_t total_size =
sizeof(*event_pool) +
available_capacity * sizeof(*event_pool->available_list);
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0,
iree_allocator_malloc(host_allocator, total_size, (void**)&event_pool));
event_pool->host_allocator = host_allocator;
event_pool->symbols = symbols;
iree_slim_mutex_initialize(&event_pool->event_mutex);
event_pool->available_capacity = available_capacity;
event_pool->available_count = 0;
iree_status_t status = iree_ok_status();
  for (iree_host_size_t i = 0; i < available_capacity; ++i) {
    status = iree_hal_cuda2_event_create(
        symbols, event_pool, host_allocator,
        &event_pool->available_list[event_pool->available_count]);
    if (!iree_status_is_ok(status)) break;
    // Only count the event once successfully created so that a partial
    // failure doesn't leave a NULL entry for the cleanup in the pool free
    // below to dereference.
    ++event_pool->available_count;
  }
if (iree_status_is_ok(status)) {
*out_event_pool = event_pool;
} else {
iree_hal_cuda2_event_pool_free(event_pool);
}
IREE_TRACE_ZONE_END(z0);
return status;
}
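
// Example usage (an illustrative sketch; error handling elided and the
// capacity chosen arbitrarily):
//   iree_hal_cuda2_event_pool_t* pool = NULL;
//   IREE_RETURN_IF_ERROR(iree_hal_cuda2_event_pool_allocate(
//       symbols, /*available_capacity=*/32, host_allocator, &pool));

// Note that freeing only destroys events currently resident in the pool;
// the pool must outlive any events still acquired from it.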
void iree_hal_cuda2_event_pool_free(iree_hal_cuda2_event_pool_t* event_pool) {
iree_allocator_t host_allocator = event_pool->host_allocator;
IREE_TRACE_ZONE_BEGIN(z0);
for (iree_host_size_t i = 0; i < event_pool->available_count; ++i) {
iree_hal_cuda2_event_t* event = event_pool->available_list[i];
iree_atomic_ref_count_dec(&event->ref_count); // -> 0
iree_hal_cuda2_event_destroy(event);
}
iree_slim_mutex_deinitialize(&event_pool->event_mutex);
iree_allocator_free(host_allocator, event_pool);
IREE_TRACE_ZONE_END(z0);
}

iree_status_t iree_hal_cuda2_event_pool_acquire(
iree_hal_cuda2_event_pool_t* event_pool, iree_host_size_t event_count,
iree_hal_cuda2_event_t** out_events) {
IREE_ASSERT_ARGUMENT(event_pool);
if (!event_count) return iree_ok_status();
IREE_ASSERT_ARGUMENT(out_events);
IREE_TRACE_ZONE_BEGIN(z0);
// We'll try to get what we can from the pool and fall back to initializing
// new iree_hal_cuda2_event_t objects.
iree_host_size_t remaining_count = event_count;
// Try first to grab from the pool.
iree_slim_mutex_lock(&event_pool->event_mutex);
iree_host_size_t from_pool_count =
iree_min(event_pool->available_count, event_count);
if (from_pool_count > 0) {
iree_host_size_t pool_base_index =
event_pool->available_count - from_pool_count;
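    // Slice events off the tail of the available list so a single memcpy
    // transfers them and the pool simply shrinks by from_pool_count.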
memcpy(out_events, &event_pool->available_list[pool_base_index],
from_pool_count * sizeof(*event_pool->available_list));
event_pool->available_count -= from_pool_count;
remaining_count -= from_pool_count;
}
iree_slim_mutex_unlock(&event_pool->event_mutex);
// Allocate the rest of the events.
if (remaining_count > 0) {
IREE_TRACE_ZONE_BEGIN_NAMED(z1, "event-pool-unpooled-acquire");
iree_status_t status = iree_ok_status();
for (iree_host_size_t i = 0; i < remaining_count; ++i) {
status = iree_hal_cuda2_event_create(event_pool->symbols, event_pool,
event_pool->host_allocator,
&out_events[from_pool_count + i]);
      if (!iree_status_is_ok(status)) {
        // Must release all events we've acquired so far. Each still carries
        // a refcount of 1, so route through the normal release path to drop
        // it to 0 and let the pool park or destroy the event.
        for (iree_host_size_t j = 0; j < from_pool_count + i; ++j) {
          iree_hal_cuda2_event_release(out_events[j]);
        }
        IREE_TRACE_ZONE_END(z1);
        IREE_TRACE_ZONE_END(z0);
        return status;
      }
}
IREE_TRACE_ZONE_END(z1);
}
IREE_TRACE_ZONE_END(z0);
return iree_ok_status();
}
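
// Example usage (a sketch; assumes |pool| was allocated as shown above):
//   iree_hal_cuda2_event_t* events[4] = {NULL};
//   IREE_RETURN_IF_ERROR(iree_hal_cuda2_event_pool_acquire(
//       pool, IREE_ARRAYSIZE(events), events));
//   // ... record/wait on the events via their CUevent handles ...
//   for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(events); ++i) {
//     iree_hal_cuda2_event_release(events[i]);
//   }
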
static void iree_hal_cuda2_event_pool_release(
iree_hal_cuda2_event_pool_t* event_pool, iree_host_size_t event_count,
iree_hal_cuda2_event_t** events) {
IREE_ASSERT_ARGUMENT(event_pool);
if (!event_count) return;
IREE_ASSERT_ARGUMENT(events);
IREE_TRACE_ZONE_BEGIN(z0);
// We'll try to release all we can back to the pool and then deinitialize
// the ones that won't fit.
iree_host_size_t remaining_count = event_count;
// Try first to release to the pool.
iree_slim_mutex_lock(&event_pool->event_mutex);
iree_host_size_t to_pool_count =
iree_min(event_pool->available_capacity - event_pool->available_count,
event_count);
if (to_pool_count > 0) {
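    // Park the events back in the pool, handing one reference over to the
    // pool so that parked events sit at a refcount of 1.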
for (iree_host_size_t i = 0; i < to_pool_count; ++i) {
IREE_ASSERT_REF_COUNT_ZERO(&events[i]->ref_count);
iree_hal_cuda2_event_retain(events[i]); // -> 1
}
iree_host_size_t pool_base_index = event_pool->available_count;
memcpy(&event_pool->available_list[pool_base_index], events,
to_pool_count * sizeof(*event_pool->available_list));
event_pool->available_count += to_pool_count;
remaining_count -= to_pool_count;
}
iree_slim_mutex_unlock(&event_pool->event_mutex);
// Deallocate the rest of the events. We don't bother resetting them as we are
// getting rid of them.
if (remaining_count > 0) {
IREE_TRACE_ZONE_BEGIN_NAMED(z1, "event-pool-unpooled-release");
for (iree_host_size_t i = 0; i < remaining_count; ++i) {
iree_hal_cuda2_event_destroy(events[to_pool_count + i]);
}
IREE_TRACE_ZONE_END(z1);
}
IREE_TRACE_ZONE_END(z0);
}