// experimental/cuda2/api.h - 3p/openxla/iree - Git at Google

 // Copyright 2023 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 // See iree/base/api.h for documentation on the API conventions used.

 #ifndef IREE_EXPERIMENTAL_CUDA2_API_H_
 #define IREE_EXPERIMENTAL_CUDA2_API_H_

 #include "iree/base/api.h"
 #include "iree/hal/api.h"

 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus

 //===----------------------------------------------------------------------===//
 // iree_hal_cuda2_device_t
 //===----------------------------------------------------------------------===//

 // Opaque storage for an ncclUniqueId, exposed without exporting the NCCL
 // headers.
 //
 // The struct carries a tag like every other type in this header so that it
 // can be forward-declared as `struct iree_hal_cuda2_nccl_id_t`.
 typedef struct iree_hal_cuda2_nccl_id_t {
   // Raw bytes of the id. NOTE(review): 128 is assumed to match
   // NCCL_UNIQUE_ID_BYTES - confirm against the NCCL headers in use.
   char data[128];
 } iree_hal_cuda2_nccl_id_t;

 // Parameters defining a CUmemoryPool.
 //
 // Both fields are byte counts. Instances of this struct are embedded in
 // iree_hal_cuda2_memory_pooling_params_t, one per pool.
 typedef struct iree_hal_cuda2_memory_pool_params_t {
   // Minimum number of bytes to keep in the pool when trimming with
   // iree_hal_device_trim.
   uint64_t minimum_capacity;
   // Soft maximum number of bytes to keep in the pool.
   // When more than this is allocated the extra will be freed at the next
   // device synchronization in order to remain under the threshold.
   // NOTE(review): presumably forwarded to the CUDA memory pool release
   // threshold attribute - confirm against the implementation.
   uint64_t release_threshold;
   // TODO: per-device access permissions array.
 } iree_hal_cuda2_memory_pool_params_t;

 // Parameters for each CUmemoryPool used for queue-ordered allocations.
 //
 // Holds one iree_hal_cuda2_memory_pool_params_t per pool; embedded in
 // iree_hal_cuda2_device_params_t as |memory_pools|.
 typedef struct iree_hal_cuda2_memory_pooling_params_t {
   // Used exclusively for DEVICE_LOCAL allocations.
   iree_hal_cuda2_memory_pool_params_t device_local;
   // Used for any host-visible/host-local memory types.
   iree_hal_cuda2_memory_pool_params_t other;
 } iree_hal_cuda2_memory_pooling_params_t;

 // Parameters configuring an iree_hal_cuda2_device_t.
 // Must be initialized with iree_hal_cuda2_device_params_initialize prior to
 // use.
 //
 // A pointer to this struct is also passed to iree_hal_cuda2_driver_create as
 // |default_params|.
 typedef struct iree_hal_cuda2_device_params_t {
   // Number of queues exposed on the device.
   // Each queue acts as a separate synchronization scope where all work executes
   // concurrently unless prohibited by semaphores.
   iree_host_size_t queue_count;

   // Total size of each block in the device shared block pool.
   // Larger sizes will lower overhead and ensure the heap isn't hit for
   // transient allocations while also increasing memory consumption.
   iree_host_size_t arena_block_size;

   // Enables tracing of command buffers when IREE tracing is enabled.
   // May take advantage of additional extensions for more accurate timing or
   // hardware-specific performance counters.
   //
   // NOTE: tracing has a non-trivial overhead and will skew the timing of
   // submissions and introduce false barriers between dispatches. Use this to
   // identify slow dispatches and refine from there; be wary of whole-program
   // tracing with this enabled.
   bool stream_tracing;

   // Whether to use async allocations when the device reports them as
   // available. Defaults to true when the device supports it.
   // NOTE(review): presumably setting this to false opts out of async
   // allocations even on devices that support them - confirm against the
   // implementation.
   bool async_allocations;

   // Parameters for each CUmemoryPool used for queue-ordered allocations.
   iree_hal_cuda2_memory_pooling_params_t memory_pools;
 } iree_hal_cuda2_device_params_t;

 // Initializes |out_params| to default values.
 //
 // iree_hal_cuda2_device_params_t must be initialized with this function
 // before use; callers may then override individual fields.
 IREE_API_EXPORT void iree_hal_cuda2_device_params_initialize(
     iree_hal_cuda2_device_params_t* out_params);

 //===----------------------------------------------------------------------===//
 // iree_hal_cuda2_driver_t
 //===----------------------------------------------------------------------===//

 // CUDA HAL driver creation options.
 //
 // Use iree_hal_cuda2_driver_options_initialize to populate default values and
 // pass the result to iree_hal_cuda2_driver_create.
 typedef struct iree_hal_cuda2_driver_options_t {
   // The index of the default CUDA device to use within the list of available
   // devices.
   int default_device_index;
 } iree_hal_cuda2_driver_options_t;

 // Initializes the given |out_options| with default driver creation options.
 //
 // The initialized options can be adjusted by the caller and then passed to
 // iree_hal_cuda2_driver_create.
 IREE_API_EXPORT void iree_hal_cuda2_driver_options_initialize(
     iree_hal_cuda2_driver_options_t* out_options);

 // Creates a CUDA HAL driver with the given |options|, from which CUDA devices
 // can be enumerated and created with specific parameters.
 //
 // |options| is expected to have been set up with
 // iree_hal_cuda2_driver_options_initialize and |default_params| with
 // iree_hal_cuda2_device_params_initialize.
 // NOTE(review): |identifier| is presumably the name the driver is exposed
 // under and |host_allocator| the allocator used for host-side driver state -
 // confirm against the implementation.
 //
 // Returns an iree_status_t; see iree/base/api.h for the API conventions used.
 //
 // |out_driver| must be released by the caller (see iree_hal_driver_release).
 IREE_API_EXPORT iree_status_t iree_hal_cuda2_driver_create(
     iree_string_view_t identifier,
     const iree_hal_cuda2_driver_options_t* options,
     const iree_hal_cuda2_device_params_t* default_params,
     iree_allocator_t host_allocator, iree_hal_driver_t** out_driver);

 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus

 #endif  // IREE_EXPERIMENTAL_CUDA2_API_H_
	// Copyright 2023 The IREE Authors
	//
	// Licensed under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

	// See iree/base/api.h for documentation on the API conventions used.

	#ifndef IREE_EXPERIMENTAL_CUDA2_API_H_
	#define IREE_EXPERIMENTAL_CUDA2_API_H_

	#include "iree/base/api.h"
	#include "iree/hal/api.h"

	#ifdef __cplusplus
	extern "C" {
	#endif // __cplusplus

	//===----------------------------------------------------------------------===//
	// iree_hal_cuda2_device_t
	//===----------------------------------------------------------------------===//

	// Opaque storage for an ncclUniqueId, exposed without exporting the NCCL
	// headers.
	//
	// The struct carries a tag like every other type in this header so that it
	// can be forward-declared as `struct iree_hal_cuda2_nccl_id_t`.
	typedef struct iree_hal_cuda2_nccl_id_t {
	  // Raw bytes of the id. NOTE(review): 128 is assumed to match
	  // NCCL_UNIQUE_ID_BYTES - confirm against the NCCL headers in use.
	  char data[128];
	} iree_hal_cuda2_nccl_id_t;

	// Parameters defining a CUmemoryPool.
	//
	// Both fields are byte counts. Instances of this struct are embedded in
	// iree_hal_cuda2_memory_pooling_params_t, one per pool.
	typedef struct iree_hal_cuda2_memory_pool_params_t {
	// Minimum number of bytes to keep in the pool when trimming with
	// iree_hal_device_trim.
	uint64_t minimum_capacity;
	// Soft maximum number of bytes to keep in the pool.
	// When more than this is allocated the extra will be freed at the next
	// device synchronization in order to remain under the threshold.
	// NOTE(review): presumably forwarded to the CUDA memory pool release
	// threshold attribute - confirm against the implementation.
	uint64_t release_threshold;
	// TODO: per-device access permissions array.
	} iree_hal_cuda2_memory_pool_params_t;

	// Parameters for each CUmemoryPool used for queue-ordered allocations.
	//
	// Holds one iree_hal_cuda2_memory_pool_params_t per pool; embedded in
	// iree_hal_cuda2_device_params_t as |memory_pools|.
	typedef struct iree_hal_cuda2_memory_pooling_params_t {
	// Used exclusively for DEVICE_LOCAL allocations.
	iree_hal_cuda2_memory_pool_params_t device_local;
	// Used for any host-visible/host-local memory types.
	iree_hal_cuda2_memory_pool_params_t other;
	} iree_hal_cuda2_memory_pooling_params_t;

	// Parameters configuring an iree_hal_cuda2_device_t.
	// Must be initialized with iree_hal_cuda2_device_params_initialize prior to
	// use.
	//
	// A pointer to this struct is also passed to iree_hal_cuda2_driver_create as
	// |default_params|.
	typedef struct iree_hal_cuda2_device_params_t {
	// Number of queues exposed on the device.
	// Each queue acts as a separate synchronization scope where all work executes
	// concurrently unless prohibited by semaphores.
	iree_host_size_t queue_count;

	// Total size of each block in the device shared block pool.
	// Larger sizes will lower overhead and ensure the heap isn't hit for
	// transient allocations while also increasing memory consumption.
	iree_host_size_t arena_block_size;

	// Enables tracing of command buffers when IREE tracing is enabled.
	// May take advantage of additional extensions for more accurate timing or
	// hardware-specific performance counters.
	//
	// NOTE: tracing has a non-trivial overhead and will skew the timing of
	// submissions and introduce false barriers between dispatches. Use this to
	// identify slow dispatches and refine from there; be wary of whole-program
	// tracing with this enabled.
	bool stream_tracing;

	// Whether to use async allocations when the device reports them as
	// available. Defaults to true when the device supports it.
	// NOTE(review): presumably setting this to false opts out of async
	// allocations even on devices that support them - confirm against the
	// implementation.
	bool async_allocations;

	// Parameters for each CUmemoryPool used for queue-ordered allocations.
	iree_hal_cuda2_memory_pooling_params_t memory_pools;
	} iree_hal_cuda2_device_params_t;

	// Initializes |out_params| to default values.
	//
	// iree_hal_cuda2_device_params_t must be initialized with this function
	// before use; callers may then override individual fields.
	IREE_API_EXPORT void iree_hal_cuda2_device_params_initialize(
	iree_hal_cuda2_device_params_t* out_params);

	//===----------------------------------------------------------------------===//
	// iree_hal_cuda2_driver_t
	//===----------------------------------------------------------------------===//

	// CUDA HAL driver creation options.
	//
	// Use iree_hal_cuda2_driver_options_initialize to populate default values and
	// pass the result to iree_hal_cuda2_driver_create.
	typedef struct iree_hal_cuda2_driver_options_t {
	// The index of the default CUDA device to use within the list of available
	// devices.
	int default_device_index;
	} iree_hal_cuda2_driver_options_t;

	// Initializes the given |out_options| with default driver creation options.
	//
	// The initialized options can be adjusted by the caller and then passed to
	// iree_hal_cuda2_driver_create.
	IREE_API_EXPORT void iree_hal_cuda2_driver_options_initialize(
	iree_hal_cuda2_driver_options_t* out_options);

	// Creates a CUDA HAL driver with the given |options|, from which CUDA devices
	// can be enumerated and created with specific parameters.
	//
	// |options| is expected to have been set up with
	// iree_hal_cuda2_driver_options_initialize and |default_params| with
	// iree_hal_cuda2_device_params_initialize.
	// NOTE(review): |identifier| is presumably the name the driver is exposed
	// under and |host_allocator| the allocator used for host-side driver state -
	// confirm against the implementation.
	//
	// Returns an iree_status_t; see iree/base/api.h for the API conventions used.
	//
	// |out_driver| must be released by the caller (see iree_hal_driver_release).
	IREE_API_EXPORT iree_status_t iree_hal_cuda2_driver_create(
	iree_string_view_t identifier,
	const iree_hal_cuda2_driver_options_t* options,
	const iree_hal_cuda2_device_params_t* default_params,
	iree_allocator_t host_allocator, iree_hal_driver_t** out_driver);

	#ifdef __cplusplus
	} // extern "C"
	#endif // __cplusplus

	#endif // IREE_EXPERIMENTAL_CUDA2_API_H_