blob: 5df17c54918fe3c55eb0ab749a1c94adc0c11918 [file] [log] [blame]
// Copyright 2023 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See iree/base/api.h for documentation on the API conventions used.
#ifndef IREE_EXPERIMENTAL_CUDA2_API_H_
#define IREE_EXPERIMENTAL_CUDA2_API_H_
#include "iree/base/api.h"
#include "iree/hal/api.h"
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
//===----------------------------------------------------------------------===//
// iree_hal_cuda2_device_t
//===----------------------------------------------------------------------===//
// ncclUniqueId exposed without exporting the NCCL headers.
typedef struct {
char data[128];
} iree_hal_cuda2_nccl_id_t;
// Parameters defining a CUmemoryPool.
typedef struct iree_hal_cuda2_memory_pool_params_t {
// Minimum number of bytes to keep in the pool when trimming with
// iree_hal_device_trim.
uint64_t minimum_capacity;
// Soft maximum number of bytes to keep in the pool.
// When more than this is allocated the extra will be freed at the next
// device synchronization in order to remain under the threshold.
uint64_t release_threshold;
// TODO: per-device access permissions array.
} iree_hal_cuda2_memory_pool_params_t;
// Parameters for each CUmemoryPool used for queue-ordered allocations.
typedef struct iree_hal_cuda2_memory_pooling_params_t {
// Used exclusively for DEVICE_LOCAL allocations.
iree_hal_cuda2_memory_pool_params_t device_local;
// Used for any host-visible/host-local memory types.
iree_hal_cuda2_memory_pool_params_t other;
} iree_hal_cuda2_memory_pooling_params_t;
// Parameters configuring an iree_hal_cuda2_device_t.
// Must be initialized with iree_hal_cuda2_device_params_initialize prior to
// use.
typedef struct iree_hal_cuda2_device_params_t {
// Number of queues exposed on the device.
// Each queue acts as a separate synchronization scope where all work executes
// concurrently unless prohibited by semaphores.
iree_host_size_t queue_count;
// Total size of each block in the device shared block pool.
// Larger sizes will lower overhead and ensure the heap isn't hit for
// transient allocations while also increasing memory consumption.
iree_host_size_t arena_block_size;
// Enables tracing of command buffers when IREE tracing is enabled.
// May take advantage of additional extensions for more accurate timing or
// hardware-specific performance counters.
//
// NOTE: tracing has a non-trivial overhead and will skew the timing of
// submissions and introduce false barriers between dispatches. Use this to
// identify slow dispatches and refine from there; be wary of whole-program
// tracing with this enabled.
bool stream_tracing;
// Whether to use async allocations even if reported as available by the
// device. Defaults to true when the device supports it.
bool async_allocations;
// Parameters for each CUmemoryPool used for queue-ordered allocations.
iree_hal_cuda2_memory_pooling_params_t memory_pools;
} iree_hal_cuda2_device_params_t;
// Initializes |out_params| to default values.
IREE_API_EXPORT void iree_hal_cuda2_device_params_initialize(
iree_hal_cuda2_device_params_t* out_params);
//===----------------------------------------------------------------------===//
// iree_hal_cuda2_driver_t
//===----------------------------------------------------------------------===//
// CUDA HAL driver creation options.
typedef struct iree_hal_cuda2_driver_options_t {
// The index of the default CUDA device to use within the list of available
// devices.
int default_device_index;
} iree_hal_cuda2_driver_options_t;
// Initializes the given |out_options| with default driver creation options.
IREE_API_EXPORT void iree_hal_cuda2_driver_options_initialize(
iree_hal_cuda2_driver_options_t* out_options);
// Creates a CUDA HAL driver with the given |options|, from which CUDA devices
// can be enumerated and created with specific parameters.
//
// |out_driver| must be released by the caller (see iree_hal_driver_release).
IREE_API_EXPORT iree_status_t iree_hal_cuda2_driver_create(
iree_string_view_t identifier,
const iree_hal_cuda2_driver_options_t* options,
const iree_hal_cuda2_device_params_t* default_params,
iree_allocator_t host_allocator, iree_hal_driver_t** out_driver);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // IREE_EXPERIMENTAL_CUDA2_API_H_