/*************************************************************************
 * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_H_
#define NCCL_H_

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#if CUDART_VERSION >= 11000
#include <cuda_bf16.h>
#endif

#define NCCL_MAJOR 2
#define NCCL_MINOR 18
#define NCCL_PATCH 1
#define NCCL_SUFFIX ""

#define NCCL_VERSION_CODE 21801
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))

#ifdef __cplusplus
extern "C" {
#endif

#include <limits.h>
/* Opaque handle to communicator */
typedef struct ncclComm* ncclComm_t;
#define NCCL_COMM_NULL NULL

#define NCCL_UNIQUE_ID_BYTES 128
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;

/* Error type */
typedef enum { ncclSuccess            = 0,
               ncclUnhandledCudaError = 1,
               ncclSystemError        = 2,
               ncclInternalError      = 3,
               ncclInvalidArgument    = 4,
               ncclInvalidUsage       = 5,
               ncclRemoteError        = 6,
               ncclInProgress         = 7,
               ncclNumResults         = 8 } ncclResult_t;

#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
#define NCCL_SPLIT_NOCOLOR -1

/* Communicator configuration. Users can assign values to attributes to specify the
 * behavior of a communicator. */
typedef struct ncclConfig_v21700 {
  /* attributes that users should never touch. */
  size_t size;
  unsigned int magic;
  unsigned int version;
  /* attributes that users are able to customize. */
  int blocking;
  int cgaClusterSize;
  int minCTAs;
  int maxCTAs;
  const char *netName;
  int splitShare;
} ncclConfig_t;

/* The config initializer must be assigned when a config structure is created.
 * An uninitialized config will result in an NCCL error. */
#define NCCL_CONFIG_INITIALIZER {                                       \
  sizeof(ncclConfig_t),                             /* size */          \
  0xcafebeef,                                       /* magic */         \
  NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */       \
  NCCL_CONFIG_UNDEF_INT,                            /* blocking */      \
  NCCL_CONFIG_UNDEF_INT,                            /* cgaClusterSize */\
  NCCL_CONFIG_UNDEF_INT,                            /* minCTAs */       \
  NCCL_CONFIG_UNDEF_INT,                            /* maxCTAs */       \
  NCCL_CONFIG_UNDEF_PTR,                            /* netName */       \
  NCCL_CONFIG_UNDEF_INT                             /* splitShare */    \
}

/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
 * This integer is coded with the MAJOR, MINOR and PATCH level of the
 * NCCL library.
 */
ncclResult_t ncclGetVersion(int *version);
ncclResult_t pncclGetVersion(int *version);
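
/* Example (illustrative sketch, not part of the API declared here): checking the
 * runtime library version against the headers this code was compiled with. Only
 * ncclGetVersion and NCCL_VERSION_CODE from above are used.
 *
 *   int version = 0;
 *   if (ncclGetVersion(&version) == ncclSuccess && version < NCCL_VERSION_CODE) {
 *     // Runtime library is older than the headers; proceed with caution.
 *   }
 */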

/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
 * called once and the Id should be distributed to all ranks in the
 * communicator before calling ncclCommInitRank. */
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
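
/* Example (illustrative sketch): rank 0 generates the id and distributes it to all
 * other ranks out-of-band. MPI_Bcast is only one possible transport and is an
 * assumption here; any mechanism that copies the ncclUniqueId bytes verbatim works.
 *
 *   ncclUniqueId id;
 *   if (myRank == 0) ncclGetUniqueId(&id);
 *   MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
 */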

/* Create a new communicator (multi thread/process version) with a configuration
 * set by users. */
ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
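
/* Example (illustrative sketch): creating a non-blocking communicator with a custom
 * CTA limit. nRanks, id and myRank are assumed to be set up as for ncclCommInitRank
 * below; error handling is omitted.
 *
 *   ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
 *   config.blocking = 0;   // return ncclInProgress instead of blocking
 *   config.maxCTAs  = 16;
 *   ncclComm_t comm;
 *   ncclCommInitRankConfig(&comm, nRanks, id, myRank, &config);
 */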

/* Creates a new communicator (multi thread/process version).
 * rank must be between 0 and nranks-1 and unique within a communicator clique.
 * Each rank is associated with a CUDA device, which has to be set before calling
 * ncclCommInitRank.
 * ncclCommInitRank implicitly synchronizes with other ranks, so it must be
 * called by different threads/processes or used within ncclGroupStart/ncclGroupEnd. */
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
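
/* Example (illustrative sketch): one process per GPU. localRank selecting the CUDA
 * device is an assumption about the job layout; id must be the value produced by
 * ncclGetUniqueId on rank 0 and distributed beforehand.
 *
 *   cudaSetDevice(localRank);
 *   ncclComm_t comm;
 *   ncclCommInitRank(&comm, nRanks, id, myRank);
 */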

/* Creates a clique of communicators (single process version).
 * This is a convenience function to create a single-process communicator clique.
 * Returns an array of ndev newly initialized communicators in comm.
 * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
 * If devlist is NULL, the first ndev CUDA devices are used.
 * Order of devlist defines user-order of processors within the communicator. */
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
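
/* Example (illustrative sketch): one process driving several GPUs. The device list
 * and count are placeholders.
 *
 *   int devs[4] = { 0, 1, 2, 3 };
 *   ncclComm_t comms[4];
 *   ncclCommInitAll(comms, 4, devs);
 */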

/* Finalize a communicator. ncclCommFinalize flushes all issued communications,
 * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
 * when the communicator is globally quiescent and related resources are freed; then,
 * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
 * itself) without blocking. */
ncclResult_t ncclCommFinalize(ncclComm_t comm);
ncclResult_t pncclCommFinalize(ncclComm_t comm);

/* Frees local resources associated with communicator object. */
ncclResult_t ncclCommDestroy(ncclComm_t comm);
ncclResult_t pncclCommDestroy(ncclComm_t comm);
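
/* Example (illustrative sketch): graceful shutdown. ncclCommFinalize drains pending
 * work; polling ncclCommGetAsyncError (declared further below) until the state leaves
 * ncclInProgress is only needed for non-blocking communicators.
 *
 *   ncclResult_t state = ncclSuccess;
 *   ncclCommFinalize(comm);
 *   do { ncclCommGetAsyncError(comm, &state); } while (state == ncclInProgress);
 *   ncclCommDestroy(comm);
 */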

/* Frees resources associated with communicator object and aborts any operations
 * that might still be running on the device. */
ncclResult_t ncclCommAbort(ncclComm_t comm);
ncclResult_t pncclCommAbort(ncclComm_t comm);

/* Creates one or more communicators from an existing one.
 * Ranks with the same color will end up in the same communicator.
 * Within the new communicator, key will be used to order ranks.
 * Passing NCCL_SPLIT_NOCOLOR as color indicates the rank will not be part of any group
 * and will therefore receive a NULL communicator.
 * If config is NULL, the new communicator will inherit the original communicator's
 * configuration. */
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
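
/* Example (illustrative sketch): splitting a communicator into two halves by rank
 * parity, keeping the original rank order within each half. myRank is assumed to be
 * this rank's index in the parent communicator.
 *
 *   ncclComm_t newcomm;
 *   ncclCommSplit(comm, myRank % 2, myRank, &newcomm, NULL);  // color = parity, key = rank, inherit config
 */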
143
Okwan Kwonc31c8a02023-01-13 16:36:40 -0800144/* Returns a string for each error code. */
145const char* ncclGetErrorString(ncclResult_t result);
146const char* pncclGetErrorString(ncclResult_t result);
147
148/* Returns a human-readable message of the last error that occurred.
149 * comm is currently unused and can be set to NULL
150 */
151const char* ncclGetLastError(ncclComm_t comm);
Ben Vanikdd977b12023-05-11 14:25:43 -0700152const char* pncclGetLastError(ncclComm_t comm);
Okwan Kwonc31c8a02023-01-13 16:36:40 -0800153
154/* Checks whether the comm has encountered any asynchronous errors */
155ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
156ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
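
/* Example (illustrative sketch): polling a non-blocking communicator after issuing
 * collectives. A sketch only; production code would typically also bound the wait
 * and call ncclCommAbort on persistent errors.
 *
 *   ncclResult_t asyncErr = ncclSuccess;
 *   do {
 *     ncclCommGetAsyncError(comm, &asyncErr);
 *   } while (asyncErr == ncclInProgress);
 *   if (asyncErr != ncclSuccess) printf("NCCL failure: %s\n", ncclGetLastError(comm));
 */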

/* Gets the number of ranks in the communicator clique. */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);

/* Returns the CUDA device number associated with the communicator. */
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);

/* Returns the user-ordered "rank" associated with the communicator. */
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);

/* Reduction operation selector */
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
typedef enum { ncclSum      = 0,
               ncclProd     = 1,
               ncclMax      = 2,
               ncclMin      = 3,
               ncclAvg      = 4,
               /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
                * serves as the least possible value for dynamic ncclRedOp_t's
                * as constructed by ncclRedOpCreate*** functions. */
               ncclNumOps   = 5,
               /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
                * It is defined to be the largest signed value (since compilers
                * are permitted to use signed enums) that won't grow
                * sizeof(ncclRedOp_t) when compared to previous NCCL versions to
                * maintain ABI compatibility. */
               ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
             } ncclRedOp_t;

/* Data types */
typedef enum { ncclInt8     = 0, ncclChar   = 0,
               ncclUint8    = 1,
               ncclInt32    = 2, ncclInt    = 2,
               ncclUint32   = 3,
               ncclInt64    = 4,
               ncclUint64   = 5,
               ncclFloat16  = 6, ncclHalf   = 6,
               ncclFloat32  = 7, ncclFloat  = 7,
               ncclFloat64  = 8, ncclDouble = 8,
#if defined(__CUDA_BF16_TYPES_EXIST__)
               ncclBfloat16 = 9,
               ncclNumTypes = 10
#else
               ncclNumTypes = 9
#endif
} ncclDataType_t;

/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
typedef enum {
  /* ncclScalarDevice: The scalar is in device-visible memory and will be
   * dereferenced while the collective is running. */
  ncclScalarDevice = 0,

  /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
   * dereferenced before the ncclRedOpCreate***() function returns. */
  ncclScalarHostImmediate = 1
} ncclScalarResidence_t;

/*
 * ncclRedOpCreatePreMulSum
 *
 * Creates a new reduction operator which pre-multiplies input values by a given
 * scalar locally before reducing them with peer values via summation. For use
 * only with collectives launched against *comm* and *datatype*. The
 * *residence* argument indicates how/when the memory pointed to by *scalar*
 * will be dereferenced. Upon return, the newly created operator's handle
 * is stored in *op*.
 */
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);

/*
 * ncclRedOpDestroy
 *
 * Destroys the reduction operator *op*. The operator must have been created by
 * ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
 * destroyed as soon as the last NCCL function which is given that operator returns.
 */
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
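
/* Example (illustrative sketch): building a scaled sum, e.g. an average across nRanks,
 * from a host-resident scalar. The scalar is read before ncclRedOpCreatePreMulSum
 * returns because ncclScalarHostImmediate is used; sendbuff, recvbuff, count and
 * stream are placeholders.
 *
 *   float scale = 1.0f / nRanks;
 *   ncclRedOp_t premulSum;
 *   ncclRedOpCreatePreMulSum(&premulSum, &scale, ncclFloat, ncclScalarHostImmediate, comm);
 *   ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, premulSum, comm, stream);
 *   ncclRedOpDestroy(premulSum, comm);
 */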

/*
 * Collective communication operations
 *
 * Collective communication operations must be called separately for each
 * communicator in a communicator clique.
 *
 * They return when operations have been enqueued on the CUDA stream.
 *
 * Since they may perform inter-CPU synchronization, each call has to be done
 * from a different thread or process, or needs to use Group Semantics (see
 * below).
 */

/*
 * Reduce
 *
 * Reduces data arrays of length count in sendbuff into recvbuff using op
 * operation.
 * recvbuff may be NULL on all calls except for root device.
 * root is the rank (not the CUDA device) where data will reside after the
 * operation is complete.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);

/*
 * (deprecated) Broadcast (in-place)
 *
 * Copies count values from root to all other devices.
 * root is the rank (not the CUDA device) where data resides before the
 * operation is started.
 *
 * This operation is implicitly in-place.
 */
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);

/*
 * Broadcast
 *
 * Copies count values from root to all other devices.
 * root is the rank (not the CUDA device) where data resides before the
 * operation is started.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);

/*
 * All-Reduce
 *
 * Reduces data arrays of length count in sendbuff using op operation, and
 * leaves identical copies of result on each recvbuff.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
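
/* Example (illustrative sketch): summing a device buffer of nElems floats across all
 * ranks, in place. The buffer, element count and stream are assumptions; the call only
 * enqueues work, so the stream must be synchronized (or waited on) before the result
 * is consumed on the host.
 *
 *   ncclAllReduce(d_buf, d_buf, nElems, ncclFloat, ncclSum, comm, stream);
 *   cudaStreamSynchronize(stream);
 */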

/*
 * Reduce-Scatter
 *
 * Reduces data in sendbuff using op operation and leaves reduced result
 * scattered over the devices so that recvbuff on rank i will contain the i-th
 * block of the result.
 * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
 * should have a size of at least nranks*recvcount elements.
 *
 * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
 */
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    cudaStream_t stream);
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    cudaStream_t stream);

/*
 * All-Gather
 *
 * Each device gathers sendcount values from other GPUs into recvbuff,
 * receiving data from rank i at offset i*sendcount.
 * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
 * should have a size of at least nranks*sendcount elements.
 *
 * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
 */
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);

/*
 * Send
 *
 * Send data from sendbuff to rank peer.
 *
 * Rank peer needs to call ncclRecv with the same datatype and the same count from this
 * rank.
 *
 * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
 * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
 * ncclGroupEnd section.
 */
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);

/*
 * Receive
 *
 * Receive data from rank peer into recvbuff.
 *
 * Rank peer needs to call ncclSend with the same datatype and the same count to this
 * rank.
 *
 * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
 * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
 * ncclGroupEnd section.
 */
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
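
/* Example (illustrative sketch): a ring exchange where every rank sends to the next
 * rank and receives from the previous one. The two point-to-point calls are grouped
 * (see Group semantics below) so they can progress concurrently; buffers, count and
 * stream are placeholders.
 *
 *   int next = (myRank + 1) % nRanks, prev = (myRank - 1 + nRanks) % nRanks;
 *   ncclGroupStart();
 *   ncclSend(d_send, count, ncclFloat, next, comm, stream);
 *   ncclRecv(d_recv, count, ncclFloat, prev, comm, stream);
 *   ncclGroupEnd();
 */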

/*
 * Group semantics
 *
 * When managing multiple GPUs from a single thread, and since NCCL collective
 * calls may perform inter-CPU synchronization, we need to "group" calls for
 * different ranks/devices into a single call.
 *
 * Grouping NCCL calls as being part of the same collective operation is done
 * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
 * collective calls until the ncclGroupEnd call, which will wait for all calls
 * to be complete. Note that for collective communication, ncclGroupEnd only
 * guarantees that the operations are enqueued on the streams, not that
 * the operation is effectively done.
 *
 * Both collective communication and ncclCommInitRank can be used in conjunction
 * with ncclGroupStart/ncclGroupEnd, but not together.
 *
 * Group semantics also allow fusing multiple operations on the same device
 * to improve performance (for aggregated collective calls), or letting
 * multiple send/receive operations progress concurrently.
 */

/*
 * Group Start
 *
 * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
 * a single NCCL operation. Nothing will be started on the CUDA stream until
 * ncclGroupEnd.
 */
ncclResult_t ncclGroupStart();
ncclResult_t pncclGroupStart();

/*
 * Group End
 *
 * End a group call. Start a fused NCCL operation consisting of all calls since
 * ncclGroupStart. Operations on the CUDA stream that depend on the NCCL operations
 * need to be issued after ncclGroupEnd.
 */
ncclResult_t ncclGroupEnd();
ncclResult_t pncclGroupEnd();
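
/* Example (illustrative sketch): one thread driving the communicator clique created by
 * ncclCommInitAll above, issuing one all-reduce per GPU inside a single group so the
 * calls do not deadlock on inter-CPU synchronization. The arrays of buffers and streams
 * are assumptions.
 *
 *   ncclGroupStart();
 *   for (int i = 0; i < 4; i++) {
 *     ncclAllReduce(d_send[i], d_recv[i], count, ncclFloat, ncclSum, comms[i], streams[i]);
 *   }
 *   ncclGroupEnd();
 */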

#ifdef __cplusplus
} // end extern "C"
#endif

#endif // end include guard