/*************************************************************************
 * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_H_
#define NCCL_H_

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#if CUDART_VERSION >= 11000
#include <cuda_bf16.h>
#endif

#define NCCL_MAJOR 2
#define NCCL_MINOR 18
#define NCCL_PATCH 1
#define NCCL_SUFFIX ""

#define NCCL_VERSION_CODE 21801
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))

#ifdef __cplusplus
extern "C" {
#endif

#include <limits.h>
/* Opaque handle to communicator */
typedef struct ncclComm* ncclComm_t;
#define NCCL_COMM_NULL NULL

#define NCCL_UNIQUE_ID_BYTES 128
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;

/* Error type */
typedef enum { ncclSuccess            = 0,
               ncclUnhandledCudaError = 1,
               ncclSystemError        = 2,
               ncclInternalError      = 3,
               ncclInvalidArgument    = 4,
               ncclInvalidUsage       = 5,
               ncclRemoteError        = 6,
               ncclInProgress         = 7,
               ncclNumResults         = 8 } ncclResult_t;

#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
#define NCCL_SPLIT_NOCOLOR -1

/* Communicator configuration. Users can assign values to attributes to specify the
 * behavior of a communicator. */
typedef struct ncclConfig_v21700 {
  /* attributes that users should never touch. */
  size_t size;
  unsigned int magic;
  unsigned int version;
  /* attributes that users are able to customize. */
  int blocking;
  int cgaClusterSize;
  int minCTAs;
  int maxCTAs;
  const char *netName;
  int splitShare;
} ncclConfig_t;

/* The config initializer must be assigned when a config structure is created.
 * An uninitialized config will result in an NCCL error. */
#define NCCL_CONFIG_INITIALIZER {                                       \
  sizeof(ncclConfig_t),                             /* size */           \
  0xcafebeef,                                       /* magic */          \
  NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */        \
  NCCL_CONFIG_UNDEF_INT,                            /* blocking */       \
  NCCL_CONFIG_UNDEF_INT,                            /* cgaClusterSize */ \
  NCCL_CONFIG_UNDEF_INT,                            /* minCTAs */        \
  NCCL_CONFIG_UNDEF_INT,                            /* maxCTAs */        \
  NCCL_CONFIG_UNDEF_PTR,                            /* netName */        \
  NCCL_CONFIG_UNDEF_INT                             /* splitShare */     \
}

/* Returns the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
 * This integer is coded with the MAJOR, MINOR and PATCH level of the
 * NCCL library.
 */
ncclResult_t  ncclGetVersion(int *version);
ncclResult_t pncclGetVersion(int *version);
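
/* Example (illustrative sketch, not part of the API): comparing the runtime
 * library version against the headers this code was compiled with. Note that
 * the version encoding changed in 2.9 (see NCCL_VERSION above), so compare
 * whole version codes rather than decoding digits by hand.
 *
 *   int version = 0;
 *   if (ncclGetVersion(&version) == ncclSuccess &&
 *       version < NCCL_VERSION(2, 18, 1)) {
 *     fprintf(stderr, "NCCL runtime (%d) is older than the headers (%d)\n",
 *             version, NCCL_VERSION_CODE);
 *   }
 */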

/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
 * called once and the Id should be distributed to all ranks in the
 * communicator before calling ncclCommInitRank. */
ncclResult_t  ncclGetUniqueId(ncclUniqueId* uniqueId);
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);

/* Create a new communicator (multi thread/process version) with a configuration
 * set by users. */
ncclResult_t  ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
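
/* Example (illustrative sketch): creating a non-blocking communicator with a
 * capped number of CTAs. The config must start from NCCL_CONFIG_INITIALIZER;
 * fields left at their NCCL_CONFIG_UNDEF_* values keep NCCL's defaults.
 * nranks, myRank and id are assumed to be provided by the caller.
 *
 *   ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
 *   config.blocking = 0;   // return ncclInProgress instead of blocking
 *   config.maxCTAs  = 16;  // limit the number of CTAs per operation
 *   ncclComm_t comm;
 *   ncclResult_t res = ncclCommInitRankConfig(&comm, nranks, id, myRank, &config);
 *   // With blocking == 0, res may be ncclInProgress; poll with
 *   // ncclCommGetAsyncError until the state leaves ncclInProgress.
 */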

/* Creates a new communicator (multi thread/process version).
 * rank must be between 0 and nranks-1 and unique within a communicator clique.
 * Each rank is associated with a CUDA device, which has to be set before calling
 * ncclCommInitRank.
 * ncclCommInitRank implicitly synchronizes with other ranks, so it must be
 * called by different threads/processes or used within ncclGroupStart/ncclGroupEnd. */
ncclResult_t  ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
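
/* Example (illustrative sketch, one process per GPU): rank 0 creates the
 * unique id and distributes it, here with MPI (an assumption; any broadcast
 * mechanism works). Each process selects its CUDA device before initializing.
 * myRank, localRank and nranks are assumed to come from the launcher.
 *
 *   ncclUniqueId id;
 *   if (myRank == 0) ncclGetUniqueId(&id);
 *   MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
 *   cudaSetDevice(localRank);
 *   ncclComm_t comm;
 *   ncclCommInitRank(&comm, nranks, id, myRank);
 */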

/* Creates a clique of communicators (single process version).
 * This is a convenience function to create a single-process communicator clique.
 * Returns an array of ndev newly initialized communicators in comm.
 * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
 * If devlist is NULL, the first ndev CUDA devices are used.
 * The order of devlist defines the user order of devices within the communicator. */
ncclResult_t  ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
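
/* Example (illustrative sketch): initializing one communicator per GPU from a
 * single process. Passing NULL for devlist uses devices 0..ndev-1.
 *
 *   enum { ndev = 4 };
 *   ncclComm_t comms[ndev];
 *   ncclCommInitAll(comms, ndev, NULL);  // or pass an explicit device list
 */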

/* Finalize a communicator. ncclCommFinalize flushes all issued communications,
 * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
 * when the communicator is globally quiescent and related resources are freed; then,
 * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
 * itself) without blocking. */
ncclResult_t  ncclCommFinalize(ncclComm_t comm);
ncclResult_t pncclCommFinalize(ncclComm_t comm);

/* Frees local resources associated with communicator object. */
ncclResult_t  ncclCommDestroy(ncclComm_t comm);
ncclResult_t pncclCommDestroy(ncclComm_t comm);

/* Frees resources associated with communicator object and aborts any operations
 * that might still be running on the device. */
ncclResult_t  ncclCommAbort(ncclComm_t comm);
ncclResult_t pncclCommAbort(ncclComm_t comm);

/* Creates one or more communicators from an existing one.
 * Ranks with the same color will end up in the same communicator.
 * Within the new communicator, key will be used to order ranks.
 * Passing NCCL_SPLIT_NOCOLOR as color indicates that the rank will not be part of
 * any group and will therefore receive a NULL communicator.
 * If config is NULL, the new communicator will inherit the original communicator's
 * configuration. */
ncclResult_t  ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
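
/* Example (illustrative sketch): splitting a communicator into two halves by
 * rank parity, keeping the original rank order within each half. A rank that
 * should be excluded would pass NCCL_SPLIT_NOCOLOR instead of a color.
 *
 *   int rank;
 *   ncclCommUserRank(comm, &rank);
 *   ncclComm_t subComm;
 *   ncclCommSplit(comm, rank % 2, rank, &subComm, NULL);  // color = parity, key = rank
 */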

/* Returns a string for each error code. */
const char*  ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result);

/* Returns a human-readable message of the last error that occurred.
 * comm is currently unused and can be set to NULL
 */
const char*  ncclGetLastError(ncclComm_t comm);
const char* pncclGetLastError(ncclComm_t comm);

/* Checks whether the comm has encountered any asynchronous errors */
ncclResult_t  ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
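
/* Example (illustrative sketch): polling a non-blocking communicator until a
 * previously issued operation (e.g. ncclCommInitRankConfig or a collective)
 * has completed or failed.
 *
 *   ncclResult_t state;
 *   do {
 *     ncclCommGetAsyncError(comm, &state);
 *   } while (state == ncclInProgress);
 *   if (state != ncclSuccess) {
 *     fprintf(stderr, "NCCL failure: %s\n", ncclGetErrorString(state));
 *   }
 */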

/* Gets the number of ranks in the communicator clique. */
ncclResult_t  ncclCommCount(const ncclComm_t comm, int* count);
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);

/* Returns the cuda device number associated with the communicator. */
ncclResult_t  ncclCommCuDevice(const ncclComm_t comm, int* device);
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);

/* Returns the user-ordered "rank" associated with the communicator. */
ncclResult_t  ncclCommUserRank(const ncclComm_t comm, int* rank);
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);

/* Reduction operation selector */
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
typedef enum { ncclSum        = 0,
               ncclProd       = 1,
               ncclMax        = 2,
               ncclMin        = 3,
               ncclAvg        = 4,
               /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
                * serves as the least possible value for dynamic ncclRedOp_t's
                * as constructed by ncclRedOpCreate*** functions. */
               ncclNumOps     = 5,
               /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
                * It is defined to be the largest signed value (since compilers
                * are permitted to use signed enums) that won't grow
                * sizeof(ncclRedOp_t) when compared to previous NCCL versions to
                * maintain ABI compatibility. */
               ncclMaxRedOp   = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
             } ncclRedOp_t;

/* Data types */
typedef enum { ncclInt8     = 0, ncclChar   = 0,
               ncclUint8    = 1,
               ncclInt32    = 2, ncclInt    = 2,
               ncclUint32   = 3,
               ncclInt64    = 4,
               ncclUint64   = 5,
               ncclFloat16  = 6, ncclHalf   = 6,
               ncclFloat32  = 7, ncclFloat  = 7,
               ncclFloat64  = 8, ncclDouble = 8,
#if defined(__CUDA_BF16_TYPES_EXIST__)
               ncclBfloat16 = 9,
               ncclNumTypes = 10
#else
               ncclNumTypes = 9
#endif
} ncclDataType_t;

/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
typedef enum {
  /* ncclScalarDevice: The scalar is in device-visible memory and will be
   * dereferenced while the collective is running. */
  ncclScalarDevice = 0,

  /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
   * dereferenced before the ncclRedOpCreate***() function returns. */
  ncclScalarHostImmediate = 1
} ncclScalarResidence_t;

/*
 * ncclRedOpCreatePreMulSum
 *
 * Creates a new reduction operator which pre-multiplies input values by a given
 * scalar locally before reducing them with peer values via summation. For use
 * only with collectives launched against *comm* and *datatype*. The
 * *residence* argument indicates how/when the memory pointed to by *scalar*
 * will be dereferenced. Upon return, the newly created operator's handle
 * is stored in *op*.
 */
ncclResult_t  ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
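
/* Example (illustrative sketch): building an operator that scales every input
 * by 1/nranks before summing, i.e. an average expressed as a pre-multiplied sum.
 * The scalar lives on the host and is consumed before the create call returns.
 * sendbuff, recvbuff, count, comm, stream and nranks are assumed caller-provided.
 *
 *   float scale = 1.0f / nranks;
 *   ncclRedOp_t premulSum;
 *   ncclRedOpCreatePreMulSum(&premulSum, &scale, ncclFloat32,
 *                            ncclScalarHostImmediate, comm);
 *   ncclAllReduce(sendbuff, recvbuff, count, ncclFloat32, premulSum, comm, stream);
 *   ncclRedOpDestroy(premulSum, comm);  // safe once the enqueuing call has returned
 */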

/*
 * ncclRedOpDestroy
 *
 * Destroys the reduction operator *op*. The operator must have been created by
 * ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
 * destroyed as soon as the last NCCL function which is given that operator returns.
 */
ncclResult_t  ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);

/*
 * Collective communication operations
 *
 * Collective communication operations must be called separately for each
 * communicator in a communicator clique.
 *
 * They return when operations have been enqueued on the CUDA stream.
 *
 * Since they may perform inter-CPU synchronization, each call has to be done
 * from a different thread or process, or has to use Group Semantics (see
 * below).
 */

/*
 * Reduce
 *
 * Reduces data arrays of length count in sendbuff into recvbuff using op
 * operation.
 * recvbuff may be NULL on all calls except for root device.
 * root is the rank (not the CUDA device) where data will reside after the
 * operation is complete.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t  ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);

/*
 * (deprecated) Broadcast (in-place)
 *
 * Copies count values from root to all other devices.
 * root is the rank (not the CUDA device) where data resides before the
 * operation is started.
 *
 * This operation is implicitly in place.
 */
ncclResult_t  ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);

/*
 * Broadcast
 *
 * Copies count values from root to all other devices.
 * root is the rank (not the CUDA device) where data resides before the
 * operation is started.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t  ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);

/*
 * All-Reduce
 *
 * Reduces data arrays of length count in sendbuff using op operation, and
 * leaves identical copies of the result on each recvbuff.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t  ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
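
/* Example (illustrative sketch): summing a float buffer across all ranks,
 * in place, then waiting for the result on the stream. sendbuff, count, comm
 * and stream are assumed to have been set up by the caller.
 *
 *   ncclAllReduce(sendbuff, sendbuff, count, ncclFloat32, ncclSum, comm, stream);
 *   cudaStreamSynchronize(stream);  // result is valid on every rank afterwards
 */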

/*
 * Reduce-Scatter
 *
 * Reduces data in sendbuff using op operation and leaves reduced result
 * scattered over the devices so that recvbuff on rank i will contain the i-th
 * block of the result.
 * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
 * should have a size of at least nranks*recvcount elements.
 *
 * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
 */
ncclResult_t  ncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    cudaStream_t stream);
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    cudaStream_t stream);
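
/* Example (illustrative sketch): the in-place pattern. Each rank provides the
 * full nranks*recvcount input and receives its own block back in the slot it
 * contributed, so recvbuff points at sendbuff + rank*recvcount. sendbuff is
 * assumed to be a float* owned by the caller.
 *
 *   float* recvbuff = sendbuff + rank * recvcount;
 *   ncclReduceScatter(sendbuff, recvbuff, recvcount, ncclFloat32, ncclSum,
 *                     comm, stream);
 */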

/*
 * All-Gather
 *
 * Each device gathers sendcount values from other GPUs into recvbuff,
 * receiving data from rank i at offset i*sendcount.
 * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
 * should have a size of at least nranks*sendcount elements.
 *
 * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
 */
ncclResult_t  ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);

/*
 * Send
 *
 * Send data from sendbuff to rank peer.
 *
 * Rank peer needs to call ncclRecv with the same datatype and the same count from this
 * rank.
 *
 * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
 * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
 * ncclGroupEnd section.
 */
ncclResult_t  ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);

/*
 * Receive
 *
 * Receive data from rank peer into recvbuff.
 *
 * Rank peer needs to call ncclSend with the same datatype and the same count to this
 * rank.
 *
 * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
 * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
 * ncclGroupEnd section.
 */
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t  ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
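
/* Example (illustrative sketch): exchanging buffers with a peer. Fusing the
 * send and the receive in one group lets both progress concurrently and avoids
 * the deadlock that two blocking point-to-point calls could otherwise cause.
 * sendbuff, recvbuff, count, peer, comm and stream are assumed caller-provided.
 *
 *   ncclGroupStart();
 *   ncclSend(sendbuff, count, ncclFloat32, peer, comm, stream);
 *   ncclRecv(recvbuff, count, ncclFloat32, peer, comm, stream);
 *   ncclGroupEnd();
 */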

/*
 * Group semantics
 *
 * When managing multiple GPUs from a single thread, and since NCCL collective
 * calls may perform inter-CPU synchronization, we need to "group" calls for
 * different ranks/devices into a single call.
 *
 * Grouping NCCL calls as being part of the same collective operation is done
 * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
 * collective calls until the ncclGroupEnd call, which will wait for all calls
 * to be complete. Note that for collective communication, ncclGroupEnd only
 * guarantees that the operations are enqueued on the streams, not that
 * the operation is effectively done.
 *
 * Both collective communication and ncclCommInitRank can be used in conjunction
 * with ncclGroupStart/ncclGroupEnd, but not both within the same group.
 *
 * Group semantics also allow fusing multiple operations on the same device
 * to improve performance (for aggregated collective calls), or permitting
 * concurrent progress of multiple send/receive operations.
 */

/*
 * Group Start
 *
 * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
 * a single NCCL operation. Nothing will be started on the CUDA stream until
 * ncclGroupEnd.
 */
ncclResult_t  ncclGroupStart();
ncclResult_t pncclGroupStart();

/*
 * Group End
 *
 * End a group call. Start a fused NCCL operation consisting of all calls since
 * ncclGroupStart. Operations on the CUDA stream that depend on the NCCL operations
 * need to be issued after ncclGroupEnd.
 */
ncclResult_t  ncclGroupEnd();
ncclResult_t pncclGroupEnd();
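
/* Example (illustrative sketch): driving several GPUs from one thread. Without
 * the group, the first ncclAllReduce would wait for ranks this thread has not
 * launched yet; the group defers the launch until ncclGroupEnd. comms, streams,
 * sendbuffs, recvbuffs, ndev and count are assumed to have been set up by the
 * caller (e.g. via ncclCommInitAll).
 *
 *   ncclGroupStart();
 *   for (int i = 0; i < ndev; i++) {
 *     ncclAllReduce(sendbuffs[i], recvbuffs[i], count, ncclFloat32, ncclSum,
 *                   comms[i], streams[i]);
 *   }
 *   ncclGroupEnd();  // operations are now enqueued on each stream
 */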

#ifdef __cplusplus
} // end extern "C"
#endif

#endif // end include guard