| /************************************************************************* |
| * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. |
| * |
| * See LICENSE.txt for license information |
| ************************************************************************/ |
| |
| #ifndef NCCL_H_ |
| #define NCCL_H_ |
| |
| #include <cuda_runtime.h> |
| #include <cuda_fp16.h> |
| #if CUDART_VERSION >= 11000 |
| #include <cuda_bf16.h> |
| #endif |
| |
| #define NCCL_MAJOR 2 |
| #define NCCL_MINOR 18 |
| #define NCCL_PATCH 1 |
| #define NCCL_SUFFIX "" |
| |
| #define NCCL_VERSION_CODE 21801 |
| #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z)) |
| |
| #ifdef __cplusplus |
| extern "C" { |
| #endif |
| |
| #include <limits.h> |
| /* Opaque handle to communicator */ |
| typedef struct ncclComm* ncclComm_t; |
| #define NCCL_COMM_NULL NULL |
| |
| #define NCCL_UNIQUE_ID_BYTES 128 |
| typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; |
| |
| /* Error type */ |
| typedef enum { ncclSuccess = 0, |
| ncclUnhandledCudaError = 1, |
| ncclSystemError = 2, |
| ncclInternalError = 3, |
| ncclInvalidArgument = 4, |
| ncclInvalidUsage = 5, |
| ncclRemoteError = 6, |
| ncclInProgress = 7, |
| ncclNumResults = 8 } ncclResult_t; |
| |
| #define NCCL_CONFIG_UNDEF_INT INT_MIN |
| #define NCCL_CONFIG_UNDEF_PTR NULL |
| #define NCCL_SPLIT_NOCOLOR -1 |
| |
| /* Communicator configuration. Users can assign values to attributes to specify the |
| * behavior of a communicator. */ |
| typedef struct ncclConfig_v21700 { |
| /* attributes that users should never touch. */ |
| size_t size; |
| unsigned int magic; |
| unsigned int version; |
| /* attributes that users are able to customize. */ |
| int blocking; |
| int cgaClusterSize; |
| int minCTAs; |
| int maxCTAs; |
| const char *netName; |
| int splitShare; |
| } ncclConfig_t; |
| |
| /* A config structure must be initialized with NCCL_CONFIG_INITIALIZER when it is |
| * created. An uninitialized config will result in an NCCL error. */ |
| #define NCCL_CONFIG_INITIALIZER { \ |
| sizeof(ncclConfig_t), /* size */ \ |
| 0xcafebeef, /* magic */ \ |
| NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ |
| NCCL_CONFIG_UNDEF_INT, /* blocking */ \ |
| NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ |
| NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ |
| NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ |
| NCCL_CONFIG_UNDEF_PTR, /* netName */ \ |
| NCCL_CONFIG_UNDEF_INT /* splitShare */ \ |
| } |
| |
| /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. |
| * This integer encodes the MAJOR, MINOR and PATCH level of the |
| * NCCL library. |
| */ |
| ncclResult_t ncclGetVersion(int *version); |
| ncclResult_t pncclGetVersion(int *version); |
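| |
| /* Example (illustrative sketch, not part of the API): the version macros can |
| * gate code at compile time, while ncclGetVersion reports the version of the |
| * library actually loaded at run time. |
| * |
| *   #if NCCL_VERSION_CODE >= NCCL_VERSION(2,18,0) |
| *   // code that relies on NCCL 2.18+ features |
| *   #endif |
| * |
| *   int version; |
| *   ncclGetVersion(&version);  // e.g. 21801 for NCCL 2.18.1 |
| */ |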
| |
| /* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be |
| * called once and the Id should be distributed to all ranks in the |
| * communicator before calling ncclCommInitRank. */ |
| ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); |
| ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); |
| |
| /* Create a new communicator (multi thread/process version) with a configuration |
| * set by users. */ |
| ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); |
| ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); |
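| |
| /* Example (illustrative sketch): creating a non-blocking communicator. The |
| * config must start from NCCL_CONFIG_INITIALIZER; only the fields to be |
| * customized are then assigned. comm, id, nranks and rank are assumed to be |
| * set up as for ncclCommInitRank below. |
| * |
| *   ncclConfig_t config = NCCL_CONFIG_INITIALIZER; |
| *   config.blocking = 0;  // return ncclInProgress instead of blocking |
| *   ncclResult_t res = ncclCommInitRankConfig(&comm, nranks, id, rank, &config); |
| *   // With blocking == 0, res may be ncclInProgress; poll |
| *   // ncclCommGetAsyncError until the state becomes ncclSuccess. |
| */ |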
| |
| /* Creates a new communicator (multi thread/process version). |
| * rank must be between 0 and nranks-1 and unique within a communicator clique. |
| * Each rank is associated to a CUDA device, which has to be set before calling |
| * ncclCommInitRank. |
| * ncclCommInitRank implicitly synchronizes with other ranks, so it must be |
| * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */ |
| ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); |
| ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); |
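| |
| /* Example (illustrative sketch): one process per GPU. The unique id must be |
| * created by a single rank and distributed out-of-band; MPI is used here only |
| * as an example transport and is not required by NCCL. rank, localRank and |
| * nranks are assumed to come from the launcher. |
| * |
| *   ncclUniqueId id; |
| *   if (rank == 0) ncclGetUniqueId(&id); |
| *   MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); |
| * |
| *   ncclComm_t comm; |
| *   cudaSetDevice(localRank);  // select this rank's GPU before initializing |
| *   ncclCommInitRank(&comm, nranks, id, rank); |
| */ |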
| |
| /* Creates a clique of communicators (single process version). |
| * This is a convenience function to create a single-process communicator clique. |
| * Returns an array of ndev newly initialized communicators in comm. |
| * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). |
| * If devlist is NULL, the first ndev CUDA devices are used. |
| * Order of devlist defines user-order of processors within the communicator. */ |
| ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); |
| ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); |
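| |
| /* Example (illustrative sketch): a single thread driving four GPUs in one |
| * process. |
| * |
| *   ncclComm_t comms[4]; |
| *   int devs[4] = { 0, 1, 2, 3 }; |
| *   ncclCommInitAll(comms, 4, devs);  // or pass NULL to use devices 0..3 |
| */ |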
| |
| /* Finalize a communicator. ncclCommFinalize flushes all issued communications |
| * and marks the communicator state as ncclInProgress. The state will change to |
| * ncclSuccess when the communicator is globally quiescent and its related |
| * resources are freed; then, calling ncclCommDestroy can locally free the rest |
| * of the resources (e.g. the communicator itself) without blocking. */ |
| ncclResult_t ncclCommFinalize(ncclComm_t comm); |
| ncclResult_t pncclCommFinalize(ncclComm_t comm); |
| |
| /* Frees local resources associated with communicator object. */ |
| ncclResult_t ncclCommDestroy(ncclComm_t comm); |
| ncclResult_t pncclCommDestroy(ncclComm_t comm); |
| |
| /* Frees resources associated with communicator object and aborts any operations |
| * that might still be running on the device. */ |
| ncclResult_t ncclCommAbort(ncclComm_t comm); |
| ncclResult_t pncclCommAbort(ncclComm_t comm); |
| |
| /* Creates one or more communicators from an existing one. |
| * Ranks with the same color will end up in the same communicator. |
| * Within the new communicator, key will be used to order ranks. |
| * Passing NCCL_SPLIT_NOCOLOR as color indicates that the rank will not be part |
| * of any group and will therefore receive a NULL communicator. |
| * If config is NULL, the new communicator will inherit the original communicator's |
| * configuration. */ |
| ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); |
| ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); |
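| |
| /* Example (illustrative sketch): splitting an existing communicator comm into |
| * two halves by rank parity. Every rank of comm must call ncclCommSplit; |
| * using key = rank preserves the relative ordering within each half. |
| * |
| *   int rank; |
| *   ncclCommUserRank(comm, &rank); |
| *   ncclComm_t newcomm; |
| *   ncclCommSplit(comm, rank % 2, rank, &newcomm, NULL);  // NULL: inherit config |
| */ |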
| |
| /* Returns a string for each error code. */ |
| const char* ncclGetErrorString(ncclResult_t result); |
| const char* pncclGetErrorString(ncclResult_t result); |
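| |
| /* Example (a common application-side pattern, not part of the API): wrapping |
| * calls in a check macro so failures are reported with a readable message. |
| * Assumes <stdio.h> and <stdlib.h>. |
| * |
| *   #define NCCLCHECK(cmd) do {                                \ |
| *     ncclResult_t res = (cmd);                                \ |
| *     if (res != ncclSuccess) {                                \ |
| *       printf("NCCL failure %s:%d '%s'\n",                    \ |
| *              __FILE__, __LINE__, ncclGetErrorString(res));   \ |
| *       exit(EXIT_FAILURE);                                    \ |
| *     }                                                        \ |
| *   } while(0) |
| */ |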
| |
| /* Returns a human-readable message of the last error that occurred. |
| * comm is currently unused and can be set to NULL |
| */ |
| const char* ncclGetLastError(ncclComm_t comm); |
| const char* pncclGetLastError(ncclComm_t comm); |
| |
| /* Checks whether the comm has encountered any asynchronous errors */ |
| ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); |
| ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); |
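| |
| /* Example (illustrative sketch): draining a non-blocking communicator comm. |
| * ncclCommFinalize may return with the state still ncclInProgress; polling |
| * ncclCommGetAsyncError until it reports ncclSuccess makes the subsequent |
| * ncclCommDestroy non-blocking. |
| * |
| *   ncclResult_t state; |
| *   ncclCommFinalize(comm); |
| *   do { |
| *     ncclCommGetAsyncError(comm, &state); |
| *   } while (state == ncclInProgress); |
| *   ncclCommDestroy(comm); |
| */ |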
| |
| /* Gets the number of ranks in the communicator clique. */ |
| ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); |
| ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); |
| |
| /* Returns the cuda device number associated with the communicator. */ |
| ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); |
| ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); |
| |
| /* Returns the user-ordered "rank" associated with the communicator. */ |
| ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); |
| ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); |
| |
| /* Reduction operation selector */ |
| typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t; |
| typedef enum { ncclSum = 0, |
| ncclProd = 1, |
| ncclMax = 2, |
| ncclMin = 3, |
| ncclAvg = 4, |
| /* ncclNumOps: The number of built-in ncclRedOp_t values. Also |
| * serves as the least possible value for dynamic ncclRedOp_t's |
| * as constructed by ncclRedOpCreate*** functions. */ |
| ncclNumOps = 5, |
| /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. |
| * It is defined to be the largest signed value (since compilers |
| * are permitted to use signed enums) that won't grow |
| * sizeof(ncclRedOp_t) when compared to previous NCCL versions to |
| * maintain ABI compatibility. */ |
| ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t)) |
| } ncclRedOp_t; |
| |
| /* Data types */ |
| typedef enum { ncclInt8 = 0, ncclChar = 0, |
| ncclUint8 = 1, |
| ncclInt32 = 2, ncclInt = 2, |
| ncclUint32 = 3, |
| ncclInt64 = 4, |
| ncclUint64 = 5, |
| ncclFloat16 = 6, ncclHalf = 6, |
| ncclFloat32 = 7, ncclFloat = 7, |
| ncclFloat64 = 8, ncclDouble = 8, |
| #if defined(__CUDA_BF16_TYPES_EXIST__) |
| ncclBfloat16 = 9, |
| ncclNumTypes = 10 |
| #else |
| ncclNumTypes = 9 |
| #endif |
| } ncclDataType_t; |
| |
| /* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ |
| typedef enum { |
| /* ncclScalarDevice: The scalar is in device-visible memory and will be |
| * dereferenced while the collective is running. */ |
| ncclScalarDevice = 0, |
| |
| /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be |
| * dereferenced before the ncclRedOpCreate***() function returns. */ |
| ncclScalarHostImmediate = 1 |
| } ncclScalarResidence_t; |
| |
| /* |
| * ncclRedOpCreatePreMulSum |
| * |
| * Creates a new reduction operator which pre-multiplies input values by a given |
| * scalar locally before reducing them with peer values via summation. For use |
| * only with collectives launched against *comm* and *datatype*. The |
| * *residence* argument indicates how/when the memory pointed to by *scalar* |
| * will be dereferenced. Upon return, the newly created operator's handle |
| * is stored in *op*. |
| */ |
| ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); |
| ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); |
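| |
| /* Example (illustrative sketch, assuming buffers, comm and stream are set up |
| * as usual): averaging across ranks by pre-scaling each contribution by |
| * 1/nranks before summation. With ncclScalarHostImmediate the scalar is read |
| * before ncclRedOpCreatePreMulSum returns, so a stack variable is safe. |
| * |
| *   float scalar = 1.0f / nranks; |
| *   ncclRedOp_t premulSum; |
| *   ncclRedOpCreatePreMulSum(&premulSum, &scalar, ncclFloat32, |
| *                            ncclScalarHostImmediate, comm); |
| *   ncclAllReduce(sendbuff, recvbuff, count, ncclFloat32, premulSum, comm, stream); |
| *   ncclRedOpDestroy(premulSum, comm); |
| */ |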
| |
| /* |
| * ncclRedOpDestroy |
| * |
| * Destroys the reduction operator *op*. The operator must have been created by |
| * ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be |
| * destroyed as soon as the last NCCL function which is given that operator returns. |
| */ |
| ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); |
| ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); |
| |
| /* |
| * Collective communication operations |
| * |
| * Collective communication operations must be called separately for each |
| * communicator in a communicator clique. |
| * |
| * They return when operations have been enqueued on the CUDA stream. |
| * |
| * Since they may perform inter-CPU synchronization, each call has to be done |
| * from a different thread or process, or must use Group Semantics (see |
| * below). |
| */ |
| |
| /* |
| * Reduce |
| * |
| * Reduces data arrays of length count in sendbuff into recvbuff using the op |
| * operation. |
| * recvbuff may be NULL on all ranks except the root. |
| * root is the rank (not the CUDA device) where data will reside after the |
| * operation is complete. |
| * |
| * In-place operation will happen if sendbuff == recvbuff. |
| */ |
| ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, |
| ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); |
| ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, |
| ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); |
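| |
| /* Example (illustrative sketch, with myRank, root and the buffers assumed): |
| * only the root needs a valid output buffer. |
| * |
| *   void* out = (myRank == root) ? recvbuff : NULL; |
| *   ncclReduce(sendbuff, out, count, ncclFloat, ncclSum, root, comm, stream); |
| */ |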
| |
| /* |
| * (deprecated) Broadcast (in-place) |
| * |
| * Copies count values from root to all other devices. |
| * root is the rank (not the CUDA device) where data resides before the |
| * operation is started. |
| * |
| * This operation is implicitly in place. |
| */ |
| ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, |
| ncclComm_t comm, cudaStream_t stream); |
| ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, |
| ncclComm_t comm, cudaStream_t stream); |
| |
| /* |
| * Broadcast |
| * |
| * Copies count values from root to all other devices. |
| * root is the rank (not the CUDA device) where data resides before the |
| * operation is started. |
| * |
| * In-place operation will happen if sendbuff == recvbuff. |
| */ |
| ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, |
| ncclComm_t comm, cudaStream_t stream); |
| ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, |
| ncclComm_t comm, cudaStream_t stream); |
| |
| /* |
| * All-Reduce |
| * |
| * Reduces data arrays of length count in sendbuff using op operation, and |
| * leaves identical copies of result on each recvbuff. |
| * |
| * In-place operation will happen if sendbuff == recvbuff. |
| */ |
| ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, |
| ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); |
| ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, |
| ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); |
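| |
| /* Example (illustrative sketch, with buff, count, comm and stream assumed): |
| * in-place sum across all ranks. The call only enqueues work; the result is |
| * ready once the stream has been synchronized. |
| * |
| *   ncclAllReduce(buff, buff, count, ncclFloat, ncclSum, comm, stream); |
| *   cudaStreamSynchronize(stream); |
| */ |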
| |
| /* |
| * Reduce-Scatter |
| * |
| * Reduces data in sendbuff using op operation and leaves reduced result |
| * scattered over the devices so that recvbuff on rank i will contain the i-th |
| * block of the result. |
| * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff |
| * should have a size of at least nranks*recvcount elements. |
| * |
| * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. |
| */ |
| ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, |
| size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, |
| cudaStream_t stream); |
| ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, |
| size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, |
| cudaStream_t stream); |
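| |
| /* Example (illustrative sketch, with rank and recvcount assumed): in-place |
| * reduce-scatter, where each rank's output block overlays its own slice of |
| * the input buffer. |
| * |
| *   float* sendbuff;  // points to nranks * recvcount allocated elements |
| *   float* recvbuff = sendbuff + rank * recvcount; |
| *   ncclReduceScatter(sendbuff, recvbuff, recvcount, ncclFloat, ncclSum, |
| *                     comm, stream); |
| */ |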
| |
| /* |
| * All-Gather |
| * |
| * Each device gathers sendcount values from all GPUs into recvbuff, |
| * receiving data from rank i at offset i*sendcount. |
| * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff |
| * should have a size of at least nranks*sendcount elements. |
| * |
| * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. |
| */ |
| ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, |
| ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); |
| ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, |
| ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); |
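| |
| /* Example (illustrative sketch, with rank and sendcount assumed): in-place |
| * all-gather, where each rank contributes the slice of recvbuff that |
| * corresponds to its own rank. |
| * |
| *   float* recvbuff;  // points to nranks * sendcount allocated elements |
| *   const float* sendbuff = recvbuff + rank * sendcount; |
| *   ncclAllGather(sendbuff, recvbuff, sendcount, ncclFloat, comm, stream); |
| */ |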
| |
| /* |
| * Send |
| * |
| * Send data from sendbuff to rank peer. |
| * |
| * Rank peer needs to call ncclRecv with the same datatype and the same count from this |
| * rank. |
| * |
| * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations |
| * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ |
| * ncclGroupEnd section. |
| */ |
| ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, |
| ncclComm_t comm, cudaStream_t stream); |
| ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, |
| ncclComm_t comm, cudaStream_t stream); |
| |
| /* |
| * Receive |
| * |
| * Receive data from rank peer into recvbuff. |
| * |
| * Rank peer needs to call ncclSend with the same datatype and the same count to this |
| * rank. |
| * |
| * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations |
| * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ |
| * ncclGroupEnd section. |
| */ |
| ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, |
| ncclComm_t comm, cudaStream_t stream); |
| ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, |
| ncclComm_t comm, cudaStream_t stream); |
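| |
| /* Example (illustrative sketch, with peer and the buffers assumed): |
| * exchanging buffers with a peer. Fusing the send and the receive in one |
| * group lets both progress concurrently; issuing them unfused from both |
| * sides could deadlock. |
| * |
| *   ncclGroupStart(); |
| *   ncclSend(sendbuff, count, ncclFloat, peer, comm, stream); |
| *   ncclRecv(recvbuff, count, ncclFloat, peer, comm, stream); |
| *   ncclGroupEnd(); |
| */ |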
| |
| /* |
| * Group semantics |
| * |
| * When managing multiple GPUs from a single thread, and since NCCL collective |
| * calls may perform inter-CPU synchronization, we need to "group" calls for |
| * different ranks/devices into a single call. |
| * |
| * Grouping NCCL calls as being part of the same collective operation is done |
| * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all |
| * subsequent calls until ncclGroupEnd is called, which will issue them all at |
| * once. Note that for collective communication, ncclGroupEnd only guarantees |
| * that the operations have been enqueued on the streams, not that they have |
| * completed. |
| * |
| * Both collective communication and ncclCommInitRank can be used in conjunction |
| * with ncclGroupStart/ncclGroupEnd, but not both within the same group. |
| * |
| * Group semantics also allow fusing multiple operations on the same device |
| * to improve performance (for aggregated collective calls), or to permit |
| * concurrent progress of multiple send/receive operations. |
| */ |
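| |
| /* Example (illustrative sketch, with ndev communicators, streams and buffer |
| * arrays assumed): a single thread driving several GPUs. Without the group, |
| * the first ncclAllReduce could block waiting for peers this same thread has |
| * not serviced yet. |
| * |
| *   ncclGroupStart(); |
| *   for (int i = 0; i < ndev; i++) |
| *     ncclAllReduce(sendbuffs[i], recvbuffs[i], count, ncclFloat, ncclSum, |
| *                   comms[i], streams[i]); |
| *   ncclGroupEnd(); |
| */ |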
| |
| /* |
| * Group Start |
| * |
| * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into |
| * a single NCCL operation. Nothing will be started on the CUDA stream until |
| * ncclGroupEnd. |
| */ |
| ncclResult_t ncclGroupStart(void); |
| ncclResult_t pncclGroupStart(void); |
| |
| /* |
| * Group End |
| * |
| * End a group call. Start a fused NCCL operation consisting of all calls since |
| * ncclGroupStart. CUDA stream operations that depend on the NCCL operations |
| * must be issued after ncclGroupEnd. |
| */ |
| ncclResult_t ncclGroupEnd(void); |
| ncclResult_t pncclGroupEnd(void); |
| |
| #ifdef __cplusplus |
| } // end extern "C" |
| #endif |
| |
| #endif // end include guard |