/*************************************************************************
 * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_H_
#define NCCL_H_

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>

#define NCCL_MAJOR 2
#define NCCL_MINOR 18
#define NCCL_PATCH 3
#define NCCL_SUFFIX ""

#define NCCL_VERSION_CODE 21803
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))

#define RCCL_BFLOAT16 1
#define RCCL_GATHER_SCATTER 1
#define RCCL_ALLTOALLV 1

#ifdef __cplusplus
extern "C" {
#endif

#include <limits.h>

/*! @brief Opaque handle to communicator
    @details A communicator contains information required to facilitate collective communications calls */
typedef struct ncclComm* ncclComm_t;
#define NCCL_COMM_NULL NULL

#define NCCL_UNIQUE_ID_BYTES 128
/*! @brief Opaque unique id used to initialize communicators
    @details The ncclUniqueId must be passed to all participating ranks */
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; /*!< Opaque array */ } ncclUniqueId;

/*! @defgroup rccl_result_code Result Codes
    @details The various result codes that RCCL API calls may return
    @{ */

/*! @brief Result type
    @details Return codes aside from ncclSuccess indicate that a call has failed */
typedef enum {
  ncclSuccess            = 0, /*!< No error */
  ncclUnhandledCudaError = 1, /*!< Unhandled HIP error */
  ncclSystemError        = 2, /*!< Unhandled system error */
  ncclInternalError      = 3, /*!< Internal Error - Please report to RCCL developers */
  ncclInvalidArgument    = 4, /*!< Invalid argument */
  ncclInvalidUsage       = 5, /*!< Invalid usage */
  ncclRemoteError        = 6, /*!< Remote process exited or there was a network error */
  ncclInProgress         = 7, /*!< RCCL operation in progress */
  ncclNumResults         = 8  /*!< Number of result types */
} ncclResult_t;
/*! @} */

#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
#define NCCL_SPLIT_NOCOLOR -1

/*! @defgroup rccl_config_type Communicator Configuration
    @details Structure that allows for customizing Communicator behavior via ncclCommInitRankConfig
    @{ */

/*! @brief Communicator configuration
    @details Users can assign values to these attributes to specify the behavior of a communicator */
typedef struct ncclConfig_v21700 {
  /* attributes that users should never touch. */
  size_t size;         /*!< Should not be touched */
  unsigned int magic;  /*!< Should not be touched */
  unsigned int version;/*!< Should not be touched */
  /* attributes that users are able to customize. */
  int blocking;        /*!< Whether calls should block */
  int cgaClusterSize;  /*!< Cooperative group array cluster size */
  int minCTAs;         /*!< Minimum number of cooperative thread arrays (blocks) */
  int maxCTAs;         /*!< Maximum number of cooperative thread arrays (blocks) */
  const char *netName; /*!< Force NCCL to use a specific network */
  int splitShare;      /*!< Allow communicators to share resources */
} ncclConfig_t;

/* The config initializer must be assigned to a config structure when it is created.
 * An uninitialized config will result in an error. */
#define NCCL_CONFIG_INITIALIZER {                                     \
  sizeof(ncclConfig_t),                             /* size */        \
  0xcafebeef,                                       /* magic */       \
  NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */     \
  NCCL_CONFIG_UNDEF_INT,                            /* blocking */    \
  NCCL_CONFIG_UNDEF_INT,                            /* cgaClusterSize */ \
  NCCL_CONFIG_UNDEF_INT,                            /* minCTAs */     \
  NCCL_CONFIG_UNDEF_INT,                            /* maxCTAs */     \
  NCCL_CONFIG_UNDEF_PTR,                            /* netName */     \
  NCCL_CONFIG_UNDEF_INT                             /* splitShare */  \
}
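
/* Illustrative sketch (not part of the API): creating a non-blocking
 * communicator via ncclCommInitRankConfig. The names nranks, myRank and id
 * are hypothetical and assumed to be set up by the caller.
 *
 *   ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
 *   config.blocking = 0;   // calls may return ncclInProgress instead of blocking
 *   ncclComm_t comm;
 *   ncclCommInitRankConfig(&comm, nranks, id, myRank, &config);
 */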
/*! @} */

/*! @defgroup rccl_api_version Version Information
    @details API call that returns RCCL version
    @{ */

/*! @brief Return the NCCL_VERSION_CODE of RCCL in the supplied integer.
    @details This integer is coded with the MAJOR, MINOR and PATCH level of RCCL.
    @return Result code. See @ref rccl_result_code for more details.

    @param[out] version Pointer to where version will be stored */
ncclResult_t ncclGetVersion(int *version);
/*! @cond include_hidden */
ncclResult_t pncclGetVersion(int *version);
/*! @endcond */
/*! @} */
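
/* Illustrative sketch (not part of the API): decoding the version integer,
 * assuming the >= 2.9 encoding (MAJOR*10000 + MINOR*100 + PATCH) used by
 * the NCCL_VERSION macro above.
 *
 *   int version;
 *   if (ncclGetVersion(&version) == ncclSuccess)
 *     printf("RCCL %d.%d.%d\n", version / 10000, (version / 100) % 100, version % 100);
 */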

/*! @defgroup rccl_api_communicator Communicator Initialization/Destruction
    @details API calls that operate on communicators.
    Communicator objects are used to launch collective communication
    operations. Unique ranks between 0 and N-1 must be assigned to
    each HIP device participating in the same communicator.
    Using the same HIP device for multiple ranks of the same communicator
    is not supported at this time.
    @{ */

/*! @brief Generates an ID for ncclCommInitRank.
    @details Generates an ID to be used in ncclCommInitRank.
    ncclGetUniqueId should be called once by a single rank and the
    ID should be distributed to all ranks in the communicator before
    using it as a parameter for ncclCommInitRank.
    @return Result code. See @ref rccl_result_code for more details.

    @param[out] uniqueId Pointer to where uniqueId will be stored */
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
/*! @cond include_hidden */
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
/*! @endcond */

/*! @brief Create a new communicator with config.
    @details Create a new communicator (multi thread/process version) with a configuration
    set by users. See @ref rccl_config_type for more details.
    Each rank is associated with a HIP device, which has to be set before calling
    ncclCommInitRank.
    @return Result code. See @ref rccl_result_code for more details.

    @param[out] comm Pointer to created communicator
    @param[in] nranks Total number of ranks participating in this communicator
    @param[in] commId UniqueId required for initialization
    @param[in] rank Current rank to create communicator for. [0 to nranks-1]
    @param[in] config Pointer to communicator configuration */
ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
/*! @cond include_hidden */
ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
/*! @endcond */

/*! @brief Creates a new communicator (multi thread/process version).
    @details Rank must be between 0 and nranks-1 and unique within a communicator clique.
    Each rank is associated with a HIP device, which has to be set before calling
    ncclCommInitRank. ncclCommInitRank implicitly synchronizes with other ranks,
    so it must be called by different threads/processes or used within ncclGroupStart/ncclGroupEnd.
    @return Result code. See @ref rccl_result_code for more details.

    @param[out] comm Pointer to created communicator
    @param[in] nranks Total number of ranks participating in this communicator
    @param[in] commId UniqueId required for initialization
    @param[in] rank Current rank to create communicator for */
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
/*! @cond include_hidden */
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
/*! @endcond */
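
/* Illustrative sketch (not part of the API): a typical multi-process bootstrap.
 * Rank 0 creates the unique ID and distributes it out-of-band; the
 * broadcastToAllRanks helper and the nranks/myRank/myLocalDevice names are
 * hypothetical (e.g. MPI_Bcast could serve as the exchange mechanism).
 *
 *   ncclUniqueId id;
 *   if (myRank == 0) ncclGetUniqueId(&id);
 *   broadcastToAllRanks(&id, sizeof(id));   // hypothetical out-of-band exchange
 *   hipSetDevice(myLocalDevice);            // bind the HIP device first
 *   ncclComm_t comm;
 *   ncclCommInitRank(&comm, nranks, id, myRank);
 */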

/*! @brief Creates a clique of communicators (single process version).
    @details This is a convenience function to create a single-process communicator clique.
    Returns an array of ndev newly initialized communicators in comm.
    comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
    If devlist is NULL, the first ndev HIP devices are used.
    Order of devlist defines user-order of processors within the communicator.
    @return Result code. See @ref rccl_result_code for more details.

    @param[out] comm Pointer to array of created communicators
    @param[in] ndev Total number of ranks participating in this communicator
    @param[in] devlist Array of GPU device indices to create for */
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
/*! @cond include_hidden */
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
/*! @endcond */
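
/* Illustrative sketch (not part of the API): a single-process clique over the
 * first two visible HIP devices.
 *
 *   ncclComm_t comms[2];
 *   int devs[2] = {0, 1};
 *   ncclCommInitAll(comms, 2, devs);
 *   // ... collective calls, one per communicator, inside a group ...
 *   for (int i = 0; i < 2; i++) ncclCommDestroy(comms[i]);
 */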

/*! @brief Finalize a communicator.
    @details ncclCommFinalize flushes all issued communications
    and marks communicator state as ncclInProgress. The state will change to ncclSuccess
    when the communicator is globally quiescent and related resources are freed; then,
    calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
    itself) without blocking.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] comm Communicator to finalize */
ncclResult_t ncclCommFinalize(ncclComm_t comm);
/*! @cond include_hidden */
ncclResult_t pncclCommFinalize(ncclComm_t comm);
/*! @endcond */

/*! @brief Frees local resources associated with communicator object.
    @details Destroy all local resources associated with the passed in communicator object
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] comm Communicator to destroy */
ncclResult_t ncclCommDestroy(ncclComm_t comm);
/*! @cond include_hidden */
ncclResult_t pncclCommDestroy(ncclComm_t comm);
/*! @endcond */

/*! @brief Abort any in-progress calls and destroy the communicator object.
    @details Frees resources associated with communicator object and aborts any operations
    that might still be running on the device.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] comm Communicator to abort and destroy */
ncclResult_t ncclCommAbort(ncclComm_t comm);
/*! @cond include_hidden */
ncclResult_t pncclCommAbort(ncclComm_t comm);
/*! @endcond */

/*! @brief Create one or more communicators from an existing one.
    @details Creates one or more communicators from an existing one.
    Ranks with the same color will end up in the same communicator.
    Within the new communicator, key will be used to order ranks.
    NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
    and will therefore return a NULL communicator.
    If config is NULL, the new communicator will inherit the original communicator's configuration.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] comm Original communicator object for this rank
    @param[in] color Color to assign this rank
    @param[in] key Key used to order ranks within the same new communicator
    @param[out] newcomm Pointer to new communicator
    @param[in] config Configuration for the new communicator. May be NULL to inherit from comm */
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
/*! @cond include_hidden */
ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
/*! @endcond */
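
/* Illustrative sketch (not part of the API): splitting an existing
 * communicator (comm, assumed already initialized) into halves by rank
 * parity, inheriting the parent's configuration.
 *
 *   int rank;
 *   ncclCommUserRank(comm, &rank);
 *   ncclComm_t newcomm;
 *   ncclCommSplit(comm, rank % 2, rank, &newcomm, NULL);
 */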
/*! @} */

/*! @defgroup rccl_api_errcheck Error Checking Calls
    @details API calls that check for errors
    @{ */

/*! @brief Returns a string for each result code.
    @details Returns a human-readable string describing the given result code.
    @return String containing description of result code.

    @param[in] result Result code to get description for */
const char* ncclGetErrorString(ncclResult_t result);
/*! @cond include_hidden */
const char* pncclGetErrorString(ncclResult_t result);
/*! @endcond */

/*! @brief Returns a message on the last error that occurred.
    @details Returns a human-readable message of the last error that occurred.
    @return String containing the last error message

    @param[in] comm is currently unused and can be set to NULL */
const char* ncclGetLastError(ncclComm_t comm);
/*! @cond include_hidden */
const char* pncclGetLastError(ncclComm_t comm);
/*! @endcond */

/*! @brief Checks whether the comm has encountered any asynchronous errors
    @details Query whether the provided communicator has encountered any asynchronous errors
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] comm Communicator to query
    @param[out] asyncError Pointer to where result code will be stored */
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
/*! @cond include_hidden */
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
/*! @endcond */
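
/* Illustrative sketch (not part of the API): polling a non-blocking
 * communicator (comm, assumed already initialized) until an enqueued
 * operation has been accepted.
 *
 *   ncclResult_t state;
 *   do {
 *     ncclCommGetAsyncError(comm, &state);
 *   } while (state == ncclInProgress);
 *   if (state != ncclSuccess)
 *     fprintf(stderr, "RCCL error: %s\n", ncclGetErrorString(state));
 */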
/*! @} */

/*! @defgroup rccl_api_comminfo Communicator Information
    @details API calls that query communicator information
    @{ */

/*! @brief Gets the number of ranks in the communicator clique.
    @details Returns the number of ranks in the communicator clique (as set during initialization)
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] comm Communicator to query
    @param[out] count Pointer to where number of ranks will be stored */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
/*! @cond include_hidden */
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
/*! @endcond */

/*! @brief Get the ROCm device index associated with a communicator
    @details Returns the ROCm device number associated with the provided communicator.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] comm Communicator to query
    @param[out] device Pointer to where the associated ROCm device index will be stored */
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
/*! @cond include_hidden */
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
/*! @endcond */

/*! @brief Get the rank associated with a communicator
    @details Returns the user-ordered "rank" associated with the provided communicator.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] comm Communicator to query
    @param[out] rank Pointer to where the associated rank will be stored */
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
/*! @cond include_hidden */
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
/*! @endcond */
/*! @} */

/*! @defgroup rccl_api_enumerations API Enumerations
    @details Enumerations used by collective communication calls
    @{ */

/*! @brief Dummy reduction enumeration
    @details Dummy reduction enumeration used to determine value for ncclMaxRedOp */
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;

/*! @brief Reduction operation selector
    @details Enumeration used to specify the various reduction operations
    ncclNumOps is the number of built-in ncclRedOp_t values and serves as
    the least possible value for dynamic ncclRedOp_t values constructed by
    ncclRedOpCreate functions.

    ncclMaxRedOp is the largest valid value for ncclRedOp_t and is defined
    to be the largest signed value (since compilers are permitted to use
    signed enums) that won't grow sizeof(ncclRedOp_t) when compared to previous
    RCCL versions to maintain ABI compatibility. */
typedef enum { ncclSum    = 0, /*!< Sum */
               ncclProd   = 1, /*!< Product */
               ncclMax    = 2, /*!< Max */
               ncclMin    = 3, /*!< Min */
               ncclAvg    = 4, /*!< Average */
               ncclNumOps = 5, /*!< Number of built-in reduction ops */
               ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t)) /*!< Largest value for ncclRedOp_t */
             } ncclRedOp_t;

/*! @brief Data types
    @details Enumeration of the various supported datatypes */
typedef enum { ncclInt8     = 0, ncclChar   = 0,
               ncclUint8    = 1,
               ncclInt32    = 2, ncclInt    = 2,
               ncclUint32   = 3,
               ncclInt64    = 4,
               ncclUint64   = 5,
               ncclFloat16  = 6, ncclHalf   = 6,
               ncclFloat32  = 7, ncclFloat  = 7,
               ncclFloat64  = 8, ncclDouble = 8,
               ncclBfloat16 = 9,
               ncclNumTypes = 10 } ncclDataType_t;
/*! @} */

/*! @defgroup rccl_api_custom_redop Custom Reduction Operator
    @details API calls relating to creation/destroying custom reduction operator
    that pre-multiplies local source arrays prior to reduction
    @{ */

/*! @brief Location and dereferencing logic for scalar arguments.
    @details Enumeration specifying memory location of the scalar argument.
    Based on where the value is stored, the argument will be dereferenced either
    while the collective is running (if in device memory), or before the ncclRedOpCreate()
    function returns (if in host memory). */
typedef enum {
  ncclScalarDevice = 0,        /*!< Scalar is in device-visible memory */
  ncclScalarHostImmediate = 1  /*!< Scalar is in host-visible memory */
} ncclScalarResidence_t;

/*! @brief Create a custom pre-multiplier reduction operator
    @details Creates a new reduction operator which pre-multiplies input values by a given
    scalar locally before reducing them with peer values via summation. For use
    only with collectives launched against *comm* and *datatype*. The
    *residence* argument indicates how/when the memory pointed to by *scalar*
    will be dereferenced. Upon return, the newly created operator's handle
    is stored in *op*.
    @return Result code. See @ref rccl_result_code for more details.

    @param[out] op Pointer to where newly created custom reduction operator is to be stored
    @param[in] scalar Pointer to scalar value.
    @param[in] datatype Scalar value datatype
    @param[in] residence Memory type of the scalar value
    @param[in] comm Communicator to associate with this custom reduction operator */
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
/*! @cond include_hidden */
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
/*! @endcond */

/*! @brief Destroy custom reduction operator
    @details Destroys the reduction operator *op*. The operator must have been created by
    ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
    destroyed as soon as the last RCCL function which is given that operator returns.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] op Custom reduction operator to be destroyed
    @param[in] comm Communicator associated with this reduction operator */
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
/*! @cond include_hidden */
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
/*! @endcond */
/*! @} */
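
/* Illustrative sketch (not part of the API): averaging via a pre-multiply-sum
 * operator with a host-side scalar of 1/nranks. The nranks, comm, stream and
 * buffer names are hypothetical.
 *
 *   float scale = 1.0f / nranks;
 *   ncclRedOp_t premulSum;
 *   ncclRedOpCreatePreMulSum(&premulSum, &scale, ncclFloat32, ncclScalarHostImmediate, comm);
 *   ncclAllReduce(sendbuff, recvbuff, count, ncclFloat32, premulSum, comm, stream);
 *   ncclRedOpDestroy(premulSum, comm);
 */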

/*! @defgroup rccl_collective_api Collective Communication Operations
    @details Collective communication operations must be called separately for each
    communicator in a communicator clique.

    They return when operations have been enqueued on the HIP stream.
    Since they may perform inter-CPU synchronization, each call has to be made
    from a different thread or process, or needs to use Group Semantics (see
    below).
    @{ */

/*! @brief Reduce
    @details Reduces data arrays of length *count* in *sendbuff* into *recvbuff* using *op*
    operation.
    *recvbuff* may be NULL on all calls except for root device.
    *root* is the rank (not the HIP device) where data will reside after the
    operation is complete.
    In-place operation will happen if sendbuff == recvbuff.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] sendbuff Local device data buffer to be reduced
    @param[out] recvbuff Data buffer where result is stored (only for *root* rank). May be null for other ranks.
    @param[in] count Number of elements in every send buffer
    @param[in] datatype Data buffer element datatype
    @param[in] op Reduction operator type
    @param[in] root Rank where result data array will be stored
    @param[in] comm Communicator group object to execute on
    @param[in] stream HIP stream to execute collective on */
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
/*! @endcond */

/*! @brief (Deprecated) Broadcast (in-place)
    @details Copies *count* values from *root* to all other devices.
    root is the rank (not the HIP device) where data resides before the
    operation is started.
    This operation is implicitly in-place.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in,out] buff Input array on *root* to be copied to other ranks. Output array for all ranks.
    @param[in] count Number of elements in data buffer
    @param[in] datatype Data buffer element datatype
    @param[in] root Rank owning buffer to be copied to others
    @param[in] comm Communicator group object to execute on
    @param[in] stream HIP stream to execute collective on */
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, hipStream_t stream);
/*! @endcond */

/*! @brief Broadcast
    @details Copies *count* values from *sendbuff* on *root* to *recvbuff* on all devices.
    *root* is the rank (not the HIP device) where data resides before the operation is started.
    *sendbuff* may be NULL on ranks other than *root*.
    In-place operation will happen if *sendbuff* == *recvbuff*.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] sendbuff Data array to copy (if *root*). May be NULL for other ranks
    @param[out] recvbuff Data array to store received array
    @param[in] count Number of elements in data buffer
    @param[in] datatype Data buffer element datatype
    @param[in] root Rank of broadcast root
    @param[in] comm Communicator group object to execute on
    @param[in] stream HIP stream to execute collective on */
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, hipStream_t stream);
/*! @endcond */

/*! @brief All-Reduce
    @details Reduces data arrays of length *count* in *sendbuff* using *op* operation, and
    leaves identical copies of result on each *recvbuff*.
    In-place operation will happen if sendbuff == recvbuff.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] sendbuff Input data array to reduce
    @param[out] recvbuff Data array to store reduced result array
    @param[in] count Number of elements in data buffer
    @param[in] datatype Data buffer element datatype
    @param[in] op Reduction operator
    @param[in] comm Communicator group object to execute on
    @param[in] stream HIP stream to execute collective on */
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
/*! @endcond */
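
/* Illustrative sketch (not part of the API): summing one million floats across
 * all ranks of a communicator; sendbuff and recvbuff are hypothetical device
 * pointers allocated by the caller.
 *
 *   size_t count = 1 << 20;
 *   ncclAllReduce(sendbuff, recvbuff, count, ncclFloat32, ncclSum, comm, stream);
 *   hipStreamSynchronize(stream);   // wait for completion before reading recvbuff
 */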

/*! @brief Reduce-Scatter
    @details Reduces data in *sendbuff* using *op* operation and leaves reduced result
    scattered over the devices so that *recvbuff* on rank i will contain the i-th
    block of the result.
    Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff*
    should have a size of at least nranks*recvcount elements.
    In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] sendbuff Input data array to reduce
    @param[out] recvbuff Data array to store reduced result subarray
    @param[in] recvcount Number of elements each rank receives
    @param[in] datatype Data buffer element datatype
    @param[in] op Reduction operator
    @param[in] comm Communicator group object to execute on
    @param[in] stream HIP stream to execute collective on */
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    hipStream_t stream);
/*! @endcond */

/*! @brief All-Gather
    @details Each device gathers *sendcount* values from other GPUs into *recvbuff*,
    receiving data from rank i at offset i*sendcount.
    Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
    should have a size of at least nranks*sendcount elements.
    In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] sendbuff Input data array to send
    @param[out] recvbuff Data array to store the gathered result
    @param[in] sendcount Number of elements each rank sends
    @param[in] datatype Data buffer element datatype
    @param[in] comm Communicator group object to execute on
    @param[in] stream HIP stream to execute collective on */
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*! @endcond */
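
/* Illustrative sketch (not part of the API): in-place all-gather, where each
 * rank contributes its own slice of a shared device buffer. The buff, rank and
 * sendcount names are hypothetical; the offset follows the in-place condition
 * above (sendbuff == recvbuff + rank * sendcount, in elements).
 *
 *   const void* sendbuff = (const char*)buff + rank * sendcount * sizeof(float);
 *   ncclAllGather(sendbuff, buff, sendcount, ncclFloat32, comm, stream);
 */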

/*! @brief Send
    @details Send data from *sendbuff* to rank *peer*.
    Rank *peer* needs to call ncclRecv with the same *datatype* and the same *count*
    as this rank.
    This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
    need to progress concurrently to complete, they must be fused within a ncclGroupStart/
    ncclGroupEnd section.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] sendbuff Data array to send
    @param[in] count Number of elements to send
    @param[in] datatype Data buffer element datatype
    @param[in] peer Peer rank to send to
    @param[in] comm Communicator group object to execute on
    @param[in] stream HIP stream to execute collective on */
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, hipStream_t stream);
/*! @endcond */

/*! @brief Receive
    @details Receive data from rank *peer* into *recvbuff*.
    Rank *peer* needs to call ncclSend with the same datatype and the same count
    as this rank.
    This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
    need to progress concurrently to complete, they must be fused within a ncclGroupStart/
    ncclGroupEnd section.
    @return Result code. See @ref rccl_result_code for more details.

    @param[out] recvbuff Data array to receive
    @param[in] count Number of elements to receive
    @param[in] datatype Data buffer element datatype
    @param[in] peer Peer rank to receive from
    @param[in] comm Communicator group object to execute on
    @param[in] stream HIP stream to execute collective on */
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, hipStream_t stream);
/*! @endcond */
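
/* Illustrative sketch (not part of the API): a bidirectional exchange with a
 * neighbor rank (peer, buffers and count are hypothetical). Fusing both calls
 * in a group lets them progress concurrently instead of deadlocking.
 *
 *   ncclGroupStart();
 *   ncclSend(sendbuff, count, ncclFloat32, peer, comm, stream);
 *   ncclRecv(recvbuff, count, ncclFloat32, peer, comm, stream);
 *   ncclGroupEnd();
 */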

/*! @brief Gather
    @details Root device gathers *sendcount* values from other GPUs into *recvbuff*,
    receiving data from rank i at offset i*sendcount.
    Assumes recvcount is equal to nranks*sendcount, which means that *recvbuff*
    should have a size of at least nranks*sendcount elements.
    In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
    *recvbuff* may be NULL on ranks other than *root*.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] sendbuff Data array to send
    @param[out] recvbuff Data array to receive into on *root*.
    @param[in] sendcount Number of elements to send per rank
    @param[in] datatype Data buffer element datatype
    @param[in] root Rank that receives data from all other ranks
    @param[in] comm Communicator group object to execute on
    @param[in] stream HIP stream to execute collective on */
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
/*! @endcond */

/*! @brief Scatter
    @details Scatters data from *root* over the devices so that *recvbuff* on rank i
    will contain the i-th block of the data on *root*.
    Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff*
    should have a size of at least nranks*recvcount elements.
    In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] sendbuff Data array to send (on *root* rank). May be NULL on other ranks.
    @param[out] recvbuff Data array to receive partial subarray into
    @param[in] recvcount Number of elements to receive per rank
    @param[in] datatype Data buffer element datatype
    @param[in] root Rank that scatters data to all other ranks
    @param[in] comm Communicator group object to execute on
    @param[in] stream HIP stream to execute collective on */
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm,
    hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm,
    hipStream_t stream);
/*! @endcond */

/*! @brief All-To-All
    @details Device (i) sends the (j)th block of data to device (j), where it is placed
    as the (i)th block. Each block for sending/receiving has *count* elements, which means
    that *recvbuff* and *sendbuff* should have a size of nranks*count elements.
    In-place operation is NOT supported. It is the user's responsibility
    to ensure that sendbuff and recvbuff are distinct.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] sendbuff Data array to send (contains blocks for each other rank)
    @param[out] recvbuff Data array to receive (contains blocks from each other rank)
    @param[in] count Number of elements to send between each pair of ranks
    @param[in] datatype Data buffer element datatype
    @param[in] comm Communicator group object to execute on
    @param[in] stream HIP stream to execute collective on */
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*! @endcond */

/*! @brief All-To-Allv
    @details Device (i) sends sendcounts[j] of data from offset sdispls[j]
    to device (j). At the same time, device (i) receives recvcounts[j] of data
    from device (j) to be placed at rdispls[j].
    sendcounts, sdispls, recvcounts and rdispls are all measured in the units
    of datatype, not bytes.
    In-place operation will happen if sendbuff == recvbuff.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] sendbuff Data array to send (contains blocks for each other rank)
    @param[in] sendcounts Array containing number of elements to send to each participating rank
    @param[in] sdispls Array of offsets into *sendbuff* for each participating rank
    @param[out] recvbuff Data array to receive (contains blocks from each other rank)
    @param[in] recvcounts Array containing number of elements to receive from each participating rank
    @param[in] rdispls Array of offsets into *recvbuff* for each participating rank
    @param[in] datatype Data buffer element datatype
    @param[in] comm Communicator group object to execute on
    @param[in] stream HIP stream to execute collective on */
ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[],
    const size_t sdispls[], void *recvbuff, const size_t recvcounts[],
    const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclAllToAllv(const void *sendbuff, const size_t sendcounts[],
    const size_t sdispls[], void *recvbuff, const size_t recvcounts[],
    const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*! @endcond */
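
/* Illustrative sketch (not part of the API): a uniform all-to-allv equivalent
 * to ncclAllToAll with *count* elements per peer. NRANKS, count and the
 * buffers are hypothetical; counts and displacements are in elements of
 * *datatype*, not bytes.
 *
 *   size_t sendcounts[NRANKS], sdispls[NRANKS], recvcounts[NRANKS], rdispls[NRANKS];
 *   for (int r = 0; r < NRANKS; r++) {
 *     sendcounts[r] = recvcounts[r] = count;
 *     sdispls[r] = rdispls[r] = (size_t)r * count;
 *   }
 *   ncclAllToAllv(sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls,
 *                 ncclFloat32, comm, stream);
 */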

/*! @} */

/*! @defgroup msccl_api MSCCL Algorithm
    @details API calls relating to the optional MSCCL algorithm datapath
    @{ */

/*! @brief Opaque handle to MSCCL algorithm */
typedef int mscclAlgoHandle_t;

/*! @brief MSCCL Load Algorithm
    @details Load the MSCCL algorithm file specified in mscclAlgoFilePath and return
    its handle via mscclAlgoHandle. This API is expected to be called by the MSCCL
    scheduler rather than by end users.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] mscclAlgoFilePath Path to MSCCL algorithm file
    @param[out] mscclAlgoHandle Returned handle to MSCCL algorithm
    @param[in] rank Current rank */
ncclResult_t mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank);
/*! @cond include_hidden */
ncclResult_t pmscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank);
/*! @endcond */

/*! @brief MSCCL Run Algorithm
    @details Run the MSCCL algorithm specified by mscclAlgoHandle. The parameter
    list merges all possible parameters required by different operations as this
    is a general-purpose API. This API is expected to be called by the MSCCL
    scheduler rather than by end users.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] sendBuff Data array to send
    @param[in] sendCounts Array containing number of elements to send to each participating rank
    @param[in] sDisPls Array of offsets into *sendBuff* for each participating rank
    @param[out] recvBuff Data array to receive
    @param[in] recvCounts Array containing number of elements to receive from each participating rank
    @param[in] rDisPls Array of offsets into *recvBuff* for each participating rank
    @param[in] count Number of elements
    @param[in] dataType Data buffer element datatype
    @param[in] root Root rank index
    @param[in] peer Peer rank index
    @param[in] op Reduction operator
    @param[in] mscclAlgoHandle Handle to MSCCL algorithm
    @param[in] comm Communicator group object to execute on
    @param[in] stream HIP stream to execute collective on */
ncclResult_t mscclRunAlgo(
    const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
    void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
    size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
    mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pmscclRunAlgo(
    const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
    void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
    size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
    mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream);
/*! @endcond */

/*! @brief MSCCL Unload Algorithm
    @details Unload a previously loaded MSCCL algorithm via its handle. This API
    is expected to be called by the MSCCL scheduler rather than by end users.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] mscclAlgoHandle Handle to MSCCL algorithm to unload */
ncclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
/*! @cond include_hidden */
ncclResult_t pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
/*! @endcond */
/*! @} */


/*! @defgroup rccl_group_api Group semantics
    @details When managing multiple GPUs from a single thread, and since RCCL collective
    calls may perform inter-CPU synchronization, we need to "group" calls for
    different ranks/devices into a single call.

    Grouping RCCL calls as being part of the same collective operation is done
    using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
    collective calls until the ncclGroupEnd call, which will wait for all calls
    to be complete. Note that for collective communication, ncclGroupEnd only
    guarantees that the operations are enqueued on the streams, not that
    the operation is effectively done.

    Both collective communication and ncclCommInitRank can be used in conjunction
    with ncclGroupStart/ncclGroupEnd, but not together.

    Group semantics also allow fusing multiple operations on the same device
    to improve performance (for aggregated collective calls), or to permit
    concurrent progress of multiple send/receive operations.
    @{ */

/*! @brief Group Start
    @details Start a group call. All calls to RCCL until ncclGroupEnd will be fused into
    a single RCCL operation. Nothing will be started on the HIP stream until
    ncclGroupEnd.
    @return Result code. See @ref rccl_result_code for more details. */
ncclResult_t ncclGroupStart();
/*! @cond include_hidden */
ncclResult_t pncclGroupStart();
/*! @endcond */

/*! @brief Group End
    @details End a group call. Start a fused RCCL operation consisting of all calls since
    ncclGroupStart. Operations on the HIP stream depending on the RCCL operations
    need to be called after ncclGroupEnd.
    @return Result code. See @ref rccl_result_code for more details. */
ncclResult_t ncclGroupEnd();
/*! @cond include_hidden */
ncclResult_t pncclGroupEnd();
/*! @endcond */
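
/* Illustrative sketch (not part of the API): one thread driving a clique of
 * nDev communicators (e.g. from ncclCommInitAll). The nDev, sendbuffs,
 * recvbuffs, comms and streams names are hypothetical; grouping makes the
 * per-device calls part of a single fused operation.
 *
 *   ncclGroupStart();
 *   for (int i = 0; i < nDev; i++)
 *     ncclAllReduce(sendbuffs[i], recvbuffs[i], count, ncclFloat32, ncclSum,
 *                   comms[i], streams[i]);
 *   ncclGroupEnd();
 */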
/*! @} */

#ifdef __cplusplus
} // end extern "C"
#endif

#endif // end include guard