/*************************************************************************
 * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_H_
#define NCCL_H_

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#if CUDART_VERSION >= 11000
#include <cuda_bf16.h>
#endif

#define NCCL_MAJOR 2
#define NCCL_MINOR 18
#define NCCL_PATCH 1
#define NCCL_SUFFIX ""

#define NCCL_VERSION_CODE 21801
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))

#ifdef __cplusplus
extern "C" {
#endif

#include <limits.h>
/* Opaque handle to communicator */
typedef struct ncclComm* ncclComm_t;
#define NCCL_COMM_NULL NULL

#define NCCL_UNIQUE_ID_BYTES 128
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;

/* Error type */
typedef enum { ncclSuccess            = 0,
               ncclUnhandledCudaError = 1,
               ncclSystemError        = 2,
               ncclInternalError      = 3,
               ncclInvalidArgument    = 4,
               ncclInvalidUsage       = 5,
               ncclRemoteError        = 6,
               ncclInProgress         = 7,
               ncclNumResults         = 8 } ncclResult_t;

#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
#define NCCL_SPLIT_NOCOLOR -1

/* Communicator configuration. Users can assign values to attributes to specify the
 * behavior of a communicator. */
typedef struct ncclConfig_v21700 {
  /* attributes that users should never touch. */
  size_t size;
  unsigned int magic;
  unsigned int version;
  /* attributes that users are able to customize. */
  int blocking;
  int cgaClusterSize;
  int minCTAs;
  int maxCTAs;
  const char *netName;
  int splitShare;
} ncclConfig_t;

/* The config initializer must be assigned when a config structure is created.
 * An uninitialized config will result in an NCCL error. */
#define NCCL_CONFIG_INITIALIZER {                                       \
  sizeof(ncclConfig_t),                             /* size */          \
  0xcafebeef,                                       /* magic */         \
  NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */       \
  NCCL_CONFIG_UNDEF_INT,                            /* blocking */      \
  NCCL_CONFIG_UNDEF_INT,                            /* cgaClusterSize */\
  NCCL_CONFIG_UNDEF_INT,                            /* minCTAs */       \
  NCCL_CONFIG_UNDEF_INT,                            /* maxCTAs */       \
  NCCL_CONFIG_UNDEF_PTR,                            /* netName */       \
  NCCL_CONFIG_UNDEF_INT                             /* splitShare */    \
}

/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
 * This integer is coded with the MAJOR, MINOR and PATCH level of the
 * NCCL library.
 */
ncclResult_t ncclGetVersion(int *version);
ncclResult_t pncclGetVersion(int *version);
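
/* Example (illustrative sketch, not part of the API declared here): checking the
 * runtime library version against the headers this code was compiled with. Only
 * ncclGetVersion and NCCL_VERSION_CODE from above are used.
 *
 *   int version = 0;
 *   if (ncclGetVersion(&version) == ncclSuccess && version < NCCL_VERSION_CODE) {
 *     // Runtime library is older than the headers; proceed with caution.
 *   }
 */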

/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
 * called once and the Id should be distributed to all ranks in the
 * communicator before calling ncclCommInitRank. */
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
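
/* Example (illustrative sketch): rank 0 generates the id and distributes it to all
 * other ranks out-of-band. MPI_Bcast is only one possible transport and is an
 * assumption here; any mechanism that copies the ncclUniqueId bytes verbatim works.
 *
 *   ncclUniqueId id;
 *   if (myRank == 0) ncclGetUniqueId(&id);
 *   MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
 */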

/* Create a new communicator (multi thread/process version) with a configuration
 * set by users. */
ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
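
/* Example (illustrative sketch): creating a non-blocking communicator with a custom
 * CTA limit. nRanks, id and myRank are assumed to be set up as for ncclCommInitRank
 * below; error handling is omitted.
 *
 *   ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
 *   config.blocking = 0;   // return ncclInProgress instead of blocking
 *   config.maxCTAs  = 16;
 *   ncclComm_t comm;
 *   ncclCommInitRankConfig(&comm, nRanks, id, myRank, &config);
 */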

/* Creates a new communicator (multi thread/process version).
 * rank must be between 0 and nranks-1 and unique within a communicator clique.
 * Each rank is associated with a CUDA device, which has to be set before calling
 * ncclCommInitRank.
 * ncclCommInitRank implicitly synchronizes with other ranks, so it must be
 * called by different threads/processes or used within ncclGroupStart/ncclGroupEnd. */
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
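
/* Example (illustrative sketch): one process per GPU. localRank selecting the CUDA
 * device is an assumption about the job layout; id must be the value produced by
 * ncclGetUniqueId on rank 0 and distributed beforehand.
 *
 *   cudaSetDevice(localRank);
 *   ncclComm_t comm;
 *   ncclCommInitRank(&comm, nRanks, id, myRank);
 */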

/* Creates a clique of communicators (single process version).
 * This is a convenience function to create a single-process communicator clique.
 * Returns an array of ndev newly initialized communicators in comm.
 * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
 * If devlist is NULL, the first ndev CUDA devices are used.
 * Order of devlist defines user-order of processors within the communicator. */
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
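
/* Example (illustrative sketch): one process driving several GPUs. The device list
 * and count are placeholders.
 *
 *   int devs[4] = { 0, 1, 2, 3 };
 *   ncclComm_t comms[4];
 *   ncclCommInitAll(comms, 4, devs);
 */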

/* Finalize a communicator. ncclCommFinalize flushes all issued communications,
 * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
 * when the communicator is globally quiescent and related resources are freed; then,
 * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
 * itself) without blocking. */
ncclResult_t ncclCommFinalize(ncclComm_t comm);
ncclResult_t pncclCommFinalize(ncclComm_t comm);

/* Frees local resources associated with communicator object. */
ncclResult_t ncclCommDestroy(ncclComm_t comm);
ncclResult_t pncclCommDestroy(ncclComm_t comm);
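
/* Example (illustrative sketch): graceful shutdown. ncclCommFinalize drains pending
 * work; polling ncclCommGetAsyncError (declared further below) until the state leaves
 * ncclInProgress is only needed for non-blocking communicators.
 *
 *   ncclResult_t state = ncclSuccess;
 *   ncclCommFinalize(comm);
 *   do { ncclCommGetAsyncError(comm, &state); } while (state == ncclInProgress);
 *   ncclCommDestroy(comm);
 */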

/* Frees resources associated with communicator object and aborts any operations
 * that might still be running on the device. */
ncclResult_t ncclCommAbort(ncclComm_t comm);
ncclResult_t pncclCommAbort(ncclComm_t comm);

/* Creates one or more communicators from an existing one.
 * Ranks with the same color will end up in the same communicator.
 * Within the new communicator, key will be used to order ranks.
 * Passing NCCL_SPLIT_NOCOLOR as color indicates the rank will not be part of any group
 * and will therefore receive a NULL communicator.
 * If config is NULL, the new communicator will inherit the original communicator's
 * configuration. */
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
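
/* Example (illustrative sketch): splitting a communicator into two halves by rank
 * parity, keeping the original rank order within each half. myRank is assumed to be
 * this rank's index in the parent communicator.
 *
 *   ncclComm_t newcomm;
 *   ncclCommSplit(comm, myRank % 2, myRank, &newcomm, NULL);  // color = parity, key = rank, inherit config
 */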
143
Okwan Kwonc31c8a02023-01-13 16:36:40 -0800144/* Returns a string for each error code. */
145const char* ncclGetErrorString(ncclResult_t result);
146const char* pncclGetErrorString(ncclResult_t result);
147
148/* Returns a human-readable message of the last error that occurred.
149 * comm is currently unused and can be set to NULL
150 */
151const char* ncclGetLastError(ncclComm_t comm);
Ben Vanikdd977b12023-05-11 14:25:43 -0700152const char* pncclGetLastError(ncclComm_t comm);
Okwan Kwonc31c8a02023-01-13 16:36:40 -0800153
154/* Checks whether the comm has encountered any asynchronous errors */
155ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
156ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
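
/* Example (illustrative sketch): polling a non-blocking communicator after issuing
 * collectives. A sketch only; production code would typically also bound the wait
 * and call ncclCommAbort on persistent errors.
 *
 *   ncclResult_t asyncErr = ncclSuccess;
 *   do {
 *     ncclCommGetAsyncError(comm, &asyncErr);
 *   } while (asyncErr == ncclInProgress);
 *   if (asyncErr != ncclSuccess) printf("NCCL failure: %s\n", ncclGetLastError(comm));
 */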

/* Gets the number of ranks in the communicator clique. */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);

/* Returns the CUDA device number associated with the communicator. */
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);

/* Returns the user-ordered "rank" associated with the communicator. */
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);

/* Reduction operation selector */
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
typedef enum { ncclSum      = 0,
               ncclProd     = 1,
               ncclMax      = 2,
               ncclMin      = 3,
               ncclAvg      = 4,
               /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
                * serves as the least possible value for dynamic ncclRedOp_t's
                * as constructed by ncclRedOpCreate*** functions. */
               ncclNumOps   = 5,
               /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
                * It is defined to be the largest signed value (since compilers
                * are permitted to use signed enums) that won't grow
                * sizeof(ncclRedOp_t) when compared to previous NCCL versions to
                * maintain ABI compatibility. */
               ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
             } ncclRedOp_t;

/* Data types */
typedef enum { ncclInt8     = 0, ncclChar   = 0,
               ncclUint8    = 1,
               ncclInt32    = 2, ncclInt    = 2,
               ncclUint32   = 3,
               ncclInt64    = 4,
               ncclUint64   = 5,
               ncclFloat16  = 6, ncclHalf   = 6,
               ncclFloat32  = 7, ncclFloat  = 7,
               ncclFloat64  = 8, ncclDouble = 8,
#if defined(__CUDA_BF16_TYPES_EXIST__)
               ncclBfloat16 = 9,
               ncclNumTypes = 10
#else
               ncclNumTypes = 9
#endif
} ncclDataType_t;

/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
typedef enum {
  /* ncclScalarDevice: The scalar is in device-visible memory and will be
   * dereferenced while the collective is running. */
  ncclScalarDevice = 0,

  /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
   * dereferenced before the ncclRedOpCreate***() function returns. */
  ncclScalarHostImmediate = 1
} ncclScalarResidence_t;

/*
 * ncclRedOpCreatePreMulSum
 *
 * Creates a new reduction operator which pre-multiplies input values by a given
 * scalar locally before reducing them with peer values via summation. For use
 * only with collectives launched against *comm* and *datatype*. The
 * *residence* argument indicates how/when the memory pointed to by *scalar*
 * will be dereferenced. Upon return, the newly created operator's handle
 * is stored in *op*.
 */
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);

/*
 * ncclRedOpDestroy
 *
 * Destroys the reduction operator *op*. The operator must have been created by
 * ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
 * destroyed as soon as the last NCCL function which is given that operator returns.
 */
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
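
/* Example (illustrative sketch): building a scaled sum, e.g. an average across nRanks,
 * from a host-resident scalar. The scalar is read before ncclRedOpCreatePreMulSum
 * returns because ncclScalarHostImmediate is used; sendbuff, recvbuff, count and
 * stream are placeholders.
 *
 *   float scale = 1.0f / nRanks;
 *   ncclRedOp_t premulSum;
 *   ncclRedOpCreatePreMulSum(&premulSum, &scale, ncclFloat, ncclScalarHostImmediate, comm);
 *   ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, premulSum, comm, stream);
 *   ncclRedOpDestroy(premulSum, comm);
 */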

/*
 * Collective communication operations
 *
 * Collective communication operations must be called separately for each
 * communicator in a communicator clique.
 *
 * They return when operations have been enqueued on the CUDA stream.
 *
 * Since they may perform inter-CPU synchronization, each call has to be done
 * from a different thread or process, or needs to use Group Semantics (see
 * below).
 */

/*
 * Reduce
 *
 * Reduces data arrays of length count in sendbuff into recvbuff using op
 * operation.
 * recvbuff may be NULL on all calls except for root device.
 * root is the rank (not the CUDA device) where data will reside after the
 * operation is complete.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);

/*
 * (deprecated) Broadcast (in-place)
 *
 * Copies count values from root to all other devices.
 * root is the rank (not the CUDA device) where data resides before the
 * operation is started.
 *
 * This operation is implicitly in-place.
 */
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);

/*
 * Broadcast
 *
 * Copies count values from root to all other devices.
 * root is the rank (not the CUDA device) where data resides before the
 * operation is started.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);

/*
 * All-Reduce
 *
 * Reduces data arrays of length count in sendbuff using op operation, and
 * leaves identical copies of result on each recvbuff.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
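
/* Example (illustrative sketch): summing a device buffer of nElems floats across all
 * ranks, in place. The buffer, element count and stream are assumptions; the call only
 * enqueues work, so the stream must be synchronized (or waited on) before the result
 * is consumed on the host.
 *
 *   ncclAllReduce(d_buf, d_buf, nElems, ncclFloat, ncclSum, comm, stream);
 *   cudaStreamSynchronize(stream);
 */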

/*
 * Reduce-Scatter
 *
 * Reduces data in sendbuff using op operation and leaves reduced result
 * scattered over the devices so that recvbuff on rank i will contain the i-th
 * block of the result.
 * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
 * should have a size of at least nranks*recvcount elements.
 *
 * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
 */
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    cudaStream_t stream);
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    cudaStream_t stream);

/*
 * All-Gather
 *
 * Each device gathers sendcount values from other GPUs into recvbuff,
 * receiving data from rank i at offset i*sendcount.
 * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
 * should have a size of at least nranks*sendcount elements.
 *
 * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
 */
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);

/*
 * Send
 *
 * Send data from sendbuff to rank peer.
 *
 * Rank peer needs to call ncclRecv with the same datatype and the same count from this
 * rank.
 *
 * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
 * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
 * ncclGroupEnd section.
 */
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);

/*
 * Receive
 *
 * Receive data from rank peer into recvbuff.
 *
 * Rank peer needs to call ncclSend with the same datatype and the same count to this
 * rank.
 *
 * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
 * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
 * ncclGroupEnd section.
 */
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
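
/* Example (illustrative sketch): a ring exchange where every rank sends to the next
 * rank and receives from the previous one. The two point-to-point calls are grouped
 * (see Group semantics below) so they can progress concurrently; buffers, count and
 * stream are placeholders.
 *
 *   int next = (myRank + 1) % nRanks, prev = (myRank - 1 + nRanks) % nRanks;
 *   ncclGroupStart();
 *   ncclSend(d_send, count, ncclFloat, next, comm, stream);
 *   ncclRecv(d_recv, count, ncclFloat, prev, comm, stream);
 *   ncclGroupEnd();
 */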

/*
 * Group semantics
 *
 * When managing multiple GPUs from a single thread, and since NCCL collective
 * calls may perform inter-CPU synchronization, we need to "group" calls for
 * different ranks/devices into a single call.
 *
 * Grouping NCCL calls as being part of the same collective operation is done
 * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
 * collective calls until the ncclGroupEnd call, which will wait for all calls
 * to be complete. Note that for collective communication, ncclGroupEnd only
 * guarantees that the operations are enqueued on the streams, not that
 * the operation is effectively done.
 *
 * Both collective communication and ncclCommInitRank can be used in conjunction
 * with ncclGroupStart/ncclGroupEnd, but not together.
 *
 * Group semantics also allow fusing multiple operations on the same device
 * to improve performance (for aggregated collective calls), or letting
 * multiple send/receive operations progress concurrently.
 */

/*
 * Group Start
 *
 * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
 * a single NCCL operation. Nothing will be started on the CUDA stream until
 * ncclGroupEnd.
 */
ncclResult_t ncclGroupStart();
ncclResult_t pncclGroupStart();

/*
 * Group End
 *
 * End a group call. Start a fused NCCL operation consisting of all calls since
 * ncclGroupStart. Operations on the CUDA stream that depend on the NCCL operations
 * need to be issued after ncclGroupEnd.
 */
ncclResult_t ncclGroupEnd();
ncclResult_t pncclGroupEnd();
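
/* Example (illustrative sketch): one thread driving the communicator clique created by
 * ncclCommInitAll above, issuing one all-reduce per GPU inside a single group so the
 * calls do not deadlock on inter-CPU synchronization. The arrays of buffers and streams
 * are assumptions.
 *
 *   ncclGroupStart();
 *   for (int i = 0; i < 4; i++) {
 *     ncclAllReduce(d_send[i], d_recv[i], count, ncclFloat, ncclSum, comms[i], streams[i]);
 *   }
 *   ncclGroupEnd();
 */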

#ifdef __cplusplus
} // end extern "C"
#endif

#endif // end include guard