Boian Petkantchin | 4132d2e | 2024-05-17 08:47:33 -0700 | [diff] [blame] | 1 | /************************************************************************* |
| 2 | * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. |
| 3 | * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. |
| 4 | * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License. |
| 5 | * |
| 6 | * See LICENSE.txt for license information |
| 7 | ************************************************************************/ |
| 8 | |
| 9 | #ifndef NCCL_H_ |
| 10 | #define NCCL_H_ |
| 11 | |
| 12 | #include <hip/hip_runtime.h> |
| 13 | #include <hip/hip_fp16.h> |
| 14 | |
| 15 | #define NCCL_MAJOR 2 |
| 16 | #define NCCL_MINOR 18 |
| 17 | #define NCCL_PATCH 3 |
| 18 | #define NCCL_SUFFIX "" |
| 19 | |
| 20 | #define NCCL_VERSION_CODE 21803 |
| 21 | #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z)) |
| 22 | |
| 23 | #define RCCL_BFLOAT16 1 |
| 24 | #define RCCL_GATHER_SCATTER 1 |
| 25 | #define RCCL_ALLTOALLV 1 |
| 26 | |
| 27 | #ifdef __cplusplus |
| 28 | extern "C" { |
| 29 | #endif |
| 30 | |
| 31 | #include <limits.h> |
| 32 | |
| 33 | /*! @brief Opaque handle to communicator |
| 34 | @details A communicator contains information required to facilitate collective communications calls */ |
| 35 | typedef struct ncclComm* ncclComm_t; |
| 36 | #define NCCL_COMM_NULL NULL |
| 37 | |
| 38 | #define NCCL_UNIQUE_ID_BYTES 128 |
/*! @brief Opaque unique id used to initialize communicators
    @details The ncclUniqueId must be passed to all participating ranks */
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; /*!< Opaque array */} ncclUniqueId;
| 42 | |
| 43 | /*! @defgroup rccl_result_code Result Codes |
| 44 | @details The various result codes that RCCL API calls may return |
| 45 | @{ */ |
| 46 | |
/*! @brief Result type
    @details Return codes aside from ncclSuccess indicate that a call has failed */
typedef enum {
  ncclSuccess = 0, /*!< No error */
  ncclUnhandledCudaError = 1, /*!< Unhandled HIP error */
  ncclSystemError = 2, /*!< Unhandled system error */
  ncclInternalError = 3, /*!< Internal Error - Please report to RCCL developers */
  ncclInvalidArgument = 4, /*!< Invalid argument */
  ncclInvalidUsage = 5, /*!< Invalid usage */
  ncclRemoteError = 6, /*!< Remote process exited or there was a network error */
  ncclInProgress = 7, /*!< RCCL operation in progress (e.g. returned while a finalizing communicator is not yet globally quiescent; see ncclCommFinalize) */
  ncclNumResults = 8 /*!< Number of result types */
} ncclResult_t;
| 60 | /*! @} */ |
| 61 | |
| 62 | #define NCCL_CONFIG_UNDEF_INT INT_MIN |
| 63 | #define NCCL_CONFIG_UNDEF_PTR NULL |
| 64 | #define NCCL_SPLIT_NOCOLOR -1 |
| 65 | |
| 66 | /*! @defgroup rccl_config_type Communicator Configuration |
| 67 | @details Structure that allows for customizing Communicator behavior via ncclCommInitRankConfig |
| 68 | @{ */ |
| 69 | |
/*! @brief Communicator configuration
    @details Users can assign value to attributes to specify the behavior of a communicator.
             The structure must be initialized with NCCL_CONFIG_INITIALIZER before any
             attribute is customized; fields left at their NCCL_CONFIG_UNDEF_INT /
             NCCL_CONFIG_UNDEF_PTR sentinel keep the library default. */
typedef struct ncclConfig_v21700 {
  /* attributes that users should never touch (filled in by NCCL_CONFIG_INITIALIZER). */
  size_t size; /*!< Should not be touched */
  unsigned int magic; /*!< Should not be touched */
  unsigned int version; /*!< Should not be touched */
  /* attributes that users are able to customize. */
  int blocking; /*!< Whether or not communicator calls should block */
  int cgaClusterSize; /*!< Cooperative group array cluster size */
  int minCTAs; /*!< Minimum number of cooperative thread arrays (blocks) */
  int maxCTAs; /*!< Maximum number of cooperative thread arrays (blocks) */
  const char *netName; /*!< Force NCCL to use a specific network */
  int splitShare; /*!< Allow communicators to share resources */
} ncclConfig_t;
| 85 | |
/* Config initializer must be assigned to initialize config structure when it is created.
 * An uninitialized config will result in an error. */
| 88 | #define NCCL_CONFIG_INITIALIZER { \ |
| 89 | sizeof(ncclConfig_t), /* size */ \ |
| 90 | 0xcafebeef, /* magic */ \ |
| 91 | NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ |
| 92 | NCCL_CONFIG_UNDEF_INT, /* blocking */ \ |
| 93 | NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ |
| 94 | NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ |
| 95 | NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ |
| 96 | NCCL_CONFIG_UNDEF_PTR, /* netName */ \ |
| 97 | NCCL_CONFIG_UNDEF_INT /* splitShare */ \ |
| 98 | } |
| 99 | /*! @} */ |
| 100 | |
| 101 | /*! @defgroup rccl_api_version Version Information |
| 102 | @details API call that returns RCCL version |
| 103 | @{ */ |
| 104 | |
| 105 | /*! @brief Return the RCCL_VERSION_CODE of RCCL in the supplied integer. |
| 106 | @details This integer is coded with the MAJOR, MINOR and PATCH level of RCCL. |
| 107 | @return Result code. See @ref rccl_result_code for more details. |
| 108 | |
| 109 | @param[out] version Pointer to where version will be stored */ |
| 110 | ncclResult_t ncclGetVersion(int *version); |
| 111 | /*! @cond include_hidden */ |
| 112 | ncclResult_t pncclGetVersion(int *version); |
| 113 | /*! @endcond */ |
| 114 | /*! @} */ |
| 115 | |
| 116 | /*! @defgroup rccl_api_communicator Communicator Initialization/Destruction |
| 117 | @details API calls that operate on communicators. |
| 118 | Communicators objects are used to launch collective communication |
| 119 | operations. Unique ranks between 0 and N-1 must be assigned to |
| 120 | each HIP device participating in the same Communicator. |
| 121 | Using the same HIP device for multiple ranks of the same Communicator |
| 122 | is not supported at this time. |
| 123 | @{ */ |
| 124 | |
| 125 | /*! @brief Generates an ID for ncclCommInitRank. |
| 126 | @details Generates an ID to be used in ncclCommInitRank. |
| 127 | ncclGetUniqueId should be called once by a single rank and the |
| 128 | ID should be distributed to all ranks in the communicator before |
| 129 | using it as a parameter for ncclCommInitRank. |
| 130 | @return Result code. See @ref rccl_result_code for more details. |
| 131 | |
| 132 | @param[out] uniqueId Pointer to where uniqueId will be stored */ |
| 133 | ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); |
| 134 | /*! @cond include_hidden */ |
| 135 | ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); |
| 136 | /*! @endcond */ |
| 137 | |
| 138 | /*! @brief Create a new communicator with config. |
| 139 | @details Create a new communicator (multi thread/process version) with a configuration |
| 140 | set by users. See @ref rccl_config_type for more details. |
            Each rank is associated to a HIP device, which has to be set before calling
            ncclCommInitRank.
| 143 | @return Result code. See @ref rccl_result_code for more details. |
| 144 | |
| 145 | @param[out] comm Pointer to created communicator |
| 146 | @param[in] nranks Total number of ranks participating in this communicator |
| 147 | @param[in] commId UniqueId required for initialization |
| 148 | @param[in] rank Current rank to create communicator for. [0 to nranks-1] |
| 149 | @param[in] config Pointer to communicator configuration */ |
| 150 | ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); |
| 151 | /*! @cond include_hidden */ |
| 152 | ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); |
| 153 | /*! @endcond */ |
| 154 | |
| 155 | /*! @brief Creates a new communicator (multi thread/process version). |
| 156 | @details Rank must be between 0 and nranks-1 and unique within a communicator clique. |
            Each rank is associated to a HIP device, which has to be set before calling
            ncclCommInitRank. ncclCommInitRank implicitly synchronizes with other ranks,
| 159 | so it must be called by different threads/processes or use ncclGroupStart/ncclGroupEnd. |
| 160 | @return Result code. See @ref rccl_result_code for more details. |
| 161 | |
| 162 | @param[out] comm Pointer to created communicator |
| 163 | @param[in] nranks Total number of ranks participating in this communicator |
| 164 | @param[in] commId UniqueId required for initialization |
| 165 | @param[in] rank Current rank to create communicator for */ |
| 166 | ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); |
| 167 | /*! @cond include_hidden */ |
| 168 | ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); |
| 169 | /*! @endcond */ |
| 170 | |
| 171 | /*! @brief Creates a clique of communicators (single process version). |
| 172 | @details This is a convenience function to create a single-process communicator clique. |
| 173 | Returns an array of ndev newly initialized communicators in comm. |
| 174 | comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). |
| 175 | If devlist is NULL, the first ndev HIP devices are used. |
| 176 | Order of devlist defines user-order of processors within the communicator. |
| 177 | @return Result code. See @ref rccl_result_code for more details. |
| 178 | |
| 179 | @param[out] comm Pointer to array of created communicators |
| 180 | @param[in] ndev Total number of ranks participating in this communicator |
| 181 | @param[in] devlist Array of GPU device indices to create for */ |
| 182 | ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); |
| 183 | /*! @cond include_hidden */ |
| 184 | ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); |
| 185 | /*! @endcond */ |
| 186 | |
| 187 | /*! @brief Finalize a communicator. |
| 188 | @details ncclCommFinalize flushes all issued communications |
| 189 | and marks communicator state as ncclInProgress. The state will change to ncclSuccess |
| 190 | when the communicator is globally quiescent and related resources are freed; then, |
| 191 | calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator |
| 192 | itself) without blocking. |
| 193 | @return Result code. See @ref rccl_result_code for more details. |
| 194 | |
| 195 | @param[in] comm Communicator to finalize */ |
| 196 | ncclResult_t ncclCommFinalize(ncclComm_t comm); |
| 197 | /*! @cond include_hidden */ |
| 198 | ncclResult_t pncclCommFinalize(ncclComm_t comm); |
| 199 | /*! @endcond */ |
| 200 | |
| 201 | /*! @brief Frees local resources associated with communicator object. |
| 202 | @details Destroy all local resources associated with the passed in communicator object |
| 203 | @return Result code. See @ref rccl_result_code for more details. |
| 204 | |
| 205 | @param[in] comm Communicator to destroy */ |
| 206 | ncclResult_t ncclCommDestroy(ncclComm_t comm); |
| 207 | /*! @cond include_hidden */ |
| 208 | ncclResult_t pncclCommDestroy(ncclComm_t comm); |
| 209 | /*! @endcond */ |
| 210 | |
| 211 | /*! @brief Abort any in-progress calls and destroy the communicator object. |
| 212 | @details Frees resources associated with communicator object and aborts any operations |
| 213 | that might still be running on the device. |
| 214 | @return Result code. See @ref rccl_result_code for more details. |
| 215 | |
| 216 | @param[in] comm Communicator to abort and destroy */ |
| 217 | ncclResult_t ncclCommAbort(ncclComm_t comm); |
| 218 | /*! @cond include_hidden */ |
| 219 | ncclResult_t pncclCommAbort(ncclComm_t comm); |
| 220 | /*! @endcond */ |
| 221 | |
| 222 | /*! @brief Create one or more communicators from an existing one. |
| 223 | @details Creates one or more communicators from an existing one. |
| 224 | Ranks with the same color will end up in the same communicator. |
| 225 | Within the new communicator, key will be used to order ranks. |
| 226 | NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group |
| 227 | and will therefore return a NULL communicator. |
| 228 | If config is NULL, the new communicator will inherit the original communicator's configuration |
| 229 | @return Result code. See @ref rccl_result_code for more details. |
| 230 | |
| 231 | @param[in] comm Original communicator object for this rank |
| 232 | @param[in] color Color to assign this rank |
| 233 | @param[in] key Key used to order ranks within the same new communicator |
| 234 | @param[out] newcomm Pointer to new communicator |
| 235 | @param[in] config Config file for new communicator. May be NULL to inherit from comm */ |
| 236 | ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); |
| 237 | /*! @cond include_hidden */ |
| 238 | ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); |
| 239 | /*! @endcond */ |
| 240 | /*! @} */ |
| 241 | |
| 242 | /*! @defgroup rccl_api_errcheck Error Checking Calls |
| 243 | @details API calls that check for errors |
| 244 | @{ */ |
| 245 | |
| 246 | /*! @brief Returns a string for each result code. |
| 247 | @details Returns a human-readable string describing the given result code. |
| 248 | @return String containing description of result code. |
| 249 | |
| 250 | @param[in] result Result code to get description for */ |
| 251 | const char* ncclGetErrorString(ncclResult_t result); |
| 252 | /*! @cond include_hidden */ |
| 253 | const char* pncclGetErrorString(ncclResult_t result); |
| 254 | /*! @endcond */ |
| 255 | |
/*! @brief Returns message on last error that occurred.
| 257 | @details Returns a human-readable message of the last error that occurred. |
| 258 | @return String containing the last result |
| 259 | |
| 260 | @param[in] comm is currently unused and can be set to NULL */ |
| 261 | const char* ncclGetLastError(ncclComm_t comm); |
| 262 | /*! @cond include_hidden */ |
| 263 | const char* pncclGetLastError(ncclComm_t comm); |
| 264 | /*! @endcond */ |
| 265 | |
| 266 | /*! @brief Checks whether the comm has encountered any asynchronous errors |
| 267 | @details Query whether the provided communicator has encountered any asynchronous errors |
| 268 | @return Result code. See @ref rccl_result_code for more details. |
| 269 | |
| 270 | @param[in] comm Communicator to query |
| 271 | @param[out] asyncError Pointer to where result code will be stored */ |
| 272 | ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); |
| 273 | /*! @cond include_hidden */ |
| 274 | ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); |
| 275 | /*! @endcond */ |
| 276 | /*! @} */ |
| 277 | |
| 278 | /*! @defgroup rccl_api_comminfo Communicator Information |
| 279 | @details API calls that query communicator information |
| 280 | @{ */ |
| 281 | |
| 282 | /*! @brief Gets the number of ranks in the communicator clique. |
| 283 | @details Returns the number of ranks in the communicator clique (as set during initialization) |
| 284 | @return Result code. See @ref rccl_result_code for more details. |
| 285 | |
| 286 | @param[in] comm Communicator to query |
| 287 | @param[out] count Pointer to where number of ranks will be stored */ |
| 288 | ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); |
| 289 | /*! @cond include_hidden */ |
| 290 | ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); |
/*! @endcond */
| 292 | |
| 293 | /*! @brief Get the ROCm device index associated with a communicator |
| 294 | @details Returns the ROCm device number associated with the provided communicator. |
| 295 | @return Result code. See @ref rccl_result_code for more details. |
| 296 | |
| 297 | @param[in] comm Communicator to query |
| 298 | @param[out] device Pointer to where the associated ROCm device index will be stored */ |
| 299 | ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); |
| 300 | /*! @cond include_hidden */ |
| 301 | ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); |
| 302 | /*! @endcond */ |
| 303 | |
| 304 | /*! @brief Get the rank associated with a communicator |
| 305 | @details Returns the user-ordered "rank" associated with the provided communicator. |
| 306 | @return Result code. See @ref rccl_result_code for more details. |
| 307 | |
| 308 | @param[in] comm Communicator to query |
| 309 | @param[out] rank Pointer to where the associated rank will be stored */ |
| 310 | ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); |
| 311 | /*! @cond include_hidden */ |
| 312 | ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); |
| 313 | /*! @endcond */ |
| 314 | /*! @} */ |
| 315 | |
| 316 | /*! @defgroup rccl_api_enumerations API Enumerations |
| 317 | @details Enumerations used by collective communication calls |
| 318 | @{ */ |
| 319 | |
/*! @brief Dummy reduction enumeration
    @details Dummy reduction enumeration used to determine value for ncclMaxRedOp;
             only its sizeof matters (it fixes the integer width used in the
             ncclMaxRedOp computation below) */
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
| 323 | |
/*! @brief Reduction operation selector
    @details Enumeration used to specify the various reduction operations
             ncclNumOps is the number of built-in ncclRedOp_t values and serves as
             the least possible value for dynamic ncclRedOp_t values constructed by
             ncclRedOpCreate functions.

             ncclMaxRedOp is the largest valid value for ncclRedOp_t and is defined
             to be the largest signed value (since compilers are permitted to use
             signed enums) that won't grow sizeof(ncclRedOp_t) when compared to previous
             RCCL versions to maintain ABI compatibility. */
typedef enum { ncclSum = 0, /*!< Sum */
               ncclProd = 1, /*!< Product */
               ncclMax = 2, /*!< Max */
               ncclMin = 3, /*!< Min */
               ncclAvg = 4, /*!< Average */
               ncclNumOps = 5, /*!< Number of built-in reduction ops */
               ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t)) /*!< Largest valid ncclRedOp_t value; not itself a usable operator */
} ncclRedOp_t;
| 342 | |
/*! @brief Data types
    @details Enumeration of the various supported datatypes */
typedef enum { ncclInt8 = 0, ncclChar = 0,       /*!< 8-bit signed integer */
               ncclUint8 = 1,                    /*!< 8-bit unsigned integer */
               ncclInt32 = 2, ncclInt = 2,       /*!< 32-bit signed integer */
               ncclUint32 = 3,                   /*!< 32-bit unsigned integer */
               ncclInt64 = 4,                    /*!< 64-bit signed integer */
               ncclUint64 = 5,                   /*!< 64-bit unsigned integer */
               ncclFloat16 = 6, ncclHalf = 6,    /*!< 16-bit floating point */
               ncclFloat32 = 7, ncclFloat = 7,   /*!< 32-bit floating point */
               ncclFloat64 = 8, ncclDouble = 8,  /*!< 64-bit floating point */
               ncclBfloat16 = 9,                 /*!< bfloat16 (see RCCL_BFLOAT16) */
               ncclNumTypes = 10 /*!< Number of supported datatypes */ } ncclDataType_t;
| 356 | /*! @} */ |
| 357 | |
| 358 | /*! @defgroup rccl_api_custom_redop Custom Reduction Operator |
| 359 | @details API calls relating to creation/destroying custom reduction operator |
| 360 | that pre-multiplies local source arrays prior to reduction |
| 361 | @{ */ |
| 362 | |
/*! @brief Location and dereferencing logic for scalar arguments.
    @details Enumeration specifying memory location of the scalar argument.
             Based on where the value is stored, the argument will be dereferenced either
             while the collective is running (if in device memory), or before the ncclRedOpCreate()
             function returns (if in host memory). */
typedef enum {
  ncclScalarDevice = 0, /*!< Scalar is in device-visible memory; dereferenced while the collective runs */
  ncclScalarHostImmediate = 1 /*!< Scalar is in host-visible memory; dereferenced before ncclRedOpCreate*() returns */
} ncclScalarResidence_t;
| 372 | |
| 373 | /*! @brief Create a custom pre-multiplier reduction operator |
| 374 | @details Creates a new reduction operator which pre-multiplies input values by a given |
| 375 | scalar locally before reducing them with peer values via summation. For use |
| 376 | only with collectives launched against *comm* and *datatype*. The |
| 377 | *residence* argument indicates how/when the memory pointed to by *scalar* |
| 378 | will be dereferenced. Upon return, the newly created operator's handle |
| 379 | is stored in *op*. |
| 380 | @return Result code. See @ref rccl_result_code for more details. |
| 381 | |
| 382 | @param[out] op Pointer to where newly created custom reduction operator is to be stored |
| 383 | @param[in] scalar Pointer to scalar value. |
| 384 | @param[in] datatype Scalar value datatype |
| 385 | @param[in] residence Memory type of the scalar value |
| 386 | @param[in] comm Communicator to associate with this custom reduction operator */ |
| 387 | ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); |
| 388 | /*! @cond include_hidden */ |
| 389 | ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); |
| 390 | /*! @endcond */ |
| 391 | |
| 392 | /*! @brief Destroy custom reduction operator |
| 393 | @details Destroys the reduction operator *op*. The operator must have been created by |
            ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
| 395 | destroyed as soon as the last RCCL function which is given that operator returns. |
| 396 | @return Result code. See @ref rccl_result_code for more details. |
| 397 | |
| 398 | @param[in] op Custom reduction operator is to be destroyed |
| 399 | @param[in] comm Communicator associated with this reduction operator */ |
| 400 | ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); |
| 401 | /*! @cond include_hidden */ |
| 402 | ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); |
| 403 | /*! @endcond */ |
| 404 | /*! @} */ |
| 405 | |
| 406 | /*! @defgroup rccl_collective_api Collective Communication Operations |
| 407 | @details Collective communication operations must be called separately for each |
| 408 | communicator in a communicator clique. |
| 409 | |
| 410 | They return when operations have been enqueued on the HIP stream. |
| 411 | Since they may perform inter-CPU synchronization, each call has to be done |
| 412 | from a different thread or process, or need to use Group Semantics (see |
| 413 | below). |
| 414 | @{ */ |
| 415 | |
| 416 | /*! @brief Reduce |
| 417 | @details Reduces data arrays of length *count* in *sendbuff* into *recvbuff* using *op* |
| 418 | operation. |
| 419 | *recvbuff* may be NULL on all calls except for root device. |
| 420 | *root* is the rank (not the HIP device) where data will reside after the |
| 421 | operation is complete. |
| 422 | In-place operation will happen if sendbuff == recvbuff. |
| 423 | @return Result code. See @ref rccl_result_code for more details. |
| 424 | |
| 425 | @param[in] sendbuff Local device data buffer to be reduced |
| 426 | @param[out] recvbuff Data buffer where result is stored (only for *root* rank). May be null for other ranks. |
| 427 | @param[in] count Number of elements in every send buffer |
| 428 | @param[in] datatype Data buffer element datatype |
| 429 | @param[in] op Reduction operator type |
| 430 | @param[in] root Rank where result data array will be stored |
| 431 | @param[in] comm Communicator group object to execute on |
| 432 | @param[in] stream HIP stream to execute collective on */ |
| 433 | ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, |
| 434 | ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream); |
| 435 | /*! @cond include_hidden */ |
| 436 | ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, |
| 437 | ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream); |
| 438 | /*! @endcond */ |
| 439 | |
| 440 | /*! @brief (Deprecated) Broadcast (in-place) |
| 441 | @details Copies *count* values from *root* to all other devices. |
            root is the rank (not the HIP device) where data resides before the
| 443 | operation is started. |
| 444 | This operation is implicitly in-place. |
| 445 | @return Result code. See @ref rccl_result_code for more details. |
| 446 | |
| 447 | @param[in,out] buff Input array on *root* to be copied to other ranks. Output array for all ranks. |
| 448 | @param[in] count Number of elements in data buffer |
| 449 | @param[in] datatype Data buffer element datatype |
| 450 | @param[in] root Rank owning buffer to be copied to others |
| 451 | @param[in] comm Communicator group object to execute on |
| 452 | @param[in] stream HIP stream to execute collective on */ |
| 453 | ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, |
| 454 | ncclComm_t comm, hipStream_t stream); |
| 455 | /*! @cond include_hidden */ |
| 456 | ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, |
| 457 | ncclComm_t comm, hipStream_t stream); |
| 458 | /*! @endcond */ |
| 459 | |
| 460 | /*! @brief Broadcast |
| 461 | @details Copies *count* values from *sendbuff* on *root* to *recvbuff* on all devices. |
| 462 | *root* is the rank (not the HIP device) where data resides before the operation is started. |
| 463 | *sendbuff* may be NULL on ranks other than *root*. |
| 464 | In-place operation will happen if *sendbuff* == *recvbuff*. |
| 465 | @return Result code. See @ref rccl_result_code for more details. |
| 466 | |
| 467 | @param[in] sendbuff Data array to copy (if *root*). May be NULL for other ranks |
    @param[out] recvbuff        Data array to store received array
| 469 | @param[in] count Number of elements in data buffer |
| 470 | @param[in] datatype Data buffer element datatype |
| 471 | @param[in] root Rank of broadcast root |
| 472 | @param[in] comm Communicator group object to execute on |
| 473 | @param[in] stream HIP stream to execute collective on */ |
| 474 | ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, |
| 475 | ncclComm_t comm, hipStream_t stream); |
| 476 | /*! @cond include_hidden */ |
| 477 | ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, |
| 478 | ncclComm_t comm, hipStream_t stream); |
| 479 | /*! @endcond */ |
| 480 | |
| 481 | /*! @brief All-Reduce |
| 482 | @details Reduces data arrays of length *count* in *sendbuff* using *op* operation, and |
| 483 | leaves identical copies of result on each *recvbuff*. |
| 484 | In-place operation will happen if sendbuff == recvbuff. |
| 485 | @return Result code. See @ref rccl_result_code for more details. |
| 486 | |
| 487 | @param[in] sendbuff Input data array to reduce |
| 488 | @param[out] recvbuff Data array to store reduced result array |
| 489 | @param[in] count Number of elements in data buffer |
| 490 | @param[in] datatype Data buffer element datatype |
| 491 | @param[in] op Reduction operator |
| 492 | @param[in] comm Communicator group object to execute on |
| 493 | @param[in] stream HIP stream to execute collective on */ |
| 494 | ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, |
| 495 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); |
| 496 | /*! @cond include_hidden */ |
| 497 | ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, |
| 498 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); |
| 499 | /*! @endcond */ |
| 500 | |
| 501 | /*! @brief Reduce-Scatter |
| 502 | @details Reduces data in *sendbuff* using *op* operation and leaves reduced result |
| 503 | scattered over the devices so that *recvbuff* on rank i will contain the i-th |
| 504 | block of the result. |
| 505 | Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff* |
| 506 | should have a size of at least nranks*recvcount elements. |
| 507 | In-place operations will happen if recvbuff == sendbuff + rank * recvcount. |
| 508 | @return Result code. See @ref rccl_result_code for more details. |
| 509 | |
| 510 | @param[in] sendbuff Input data array to reduce |
| 511 | @param[out] recvbuff Data array to store reduced result subarray |
| 512 | @param[in] recvcount Number of elements each rank receives |
| 513 | @param[in] datatype Data buffer element datatype |
| 514 | @param[in] op Reduction operator |
| 515 | @param[in] comm Communicator group object to execute on |
| 516 | @param[in] stream HIP stream to execute collective on */ |
| 517 | ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, |
| 518 | size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, |
| 519 | hipStream_t stream); |
| 520 | /*! @cond include_hidden */ |
| 521 | ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, |
| 522 | size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, |
| 523 | hipStream_t stream); |
| 524 | /*! @endcond */ |
| 525 | |
| 526 | /*! @brief All-Gather |
| 527 | @details Each device gathers *sendcount* values from other GPUs into *recvbuff*, |
| 528 | receiving data from rank i at offset i*sendcount. |
| 529 | Assumes recvcount is equal to nranks*sendcount, which means that recvbuff |
| 530 | should have a size of at least nranks*sendcount elements. |
| 531 | In-place operations will happen if sendbuff == recvbuff + rank * sendcount. |
| 532 | @return Result code. See @ref rccl_result_code for more details. |
| 533 | |
| 534 | @param[in] sendbuff Input data array to send |
| 535 | @param[out] recvbuff Data array to store the gathered result |
| 536 | @param[in] sendcount Number of elements each rank sends |
| 537 | @param[in] datatype Data buffer element datatype |
| 538 | @param[in] comm Communicator group object to execute on |
| 539 | @param[in] stream HIP stream to execute collective on */ |
| 540 | ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, |
| 541 | ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); |
| 542 | /*! @cond include_hidden */ |
| 543 | ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, |
| 544 | ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); |
| 545 | /*! @endcond */ |
| 546 | |
| 547 | /*! @brief Send |
| 548 | @details Send data from *sendbuff* to rank *peer*. |
| 549 | Rank *peer* needs to call ncclRecv with the same *datatype* and the same *count* |
| 550 | as this rank. |
| 551 | This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations |
| 552 | need to progress concurrently to complete, they must be fused within a ncclGroupStart / |
| 553 | ncclGroupEnd section. |
| 554 | @return Result code. See @ref rccl_result_code for more details. |
| 555 | |
| 556 | @param[in] sendbuff Data array to send |
| 557 | @param[in] count Number of elements to send |
| 558 | @param[in] datatype Data buffer element datatype |
| 559 | @param[in] peer Peer rank to send to |
| 560 | @param[in] comm Communicator group object to execute on |
| 561 | @param[in] stream HIP stream to execute collective on */ |
| 562 | ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, |
| 563 | ncclComm_t comm, hipStream_t stream); |
| 564 | /*! @cond include_hidden */ |
| 565 | ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, |
| 566 | ncclComm_t comm, hipStream_t stream); |
| 567 | /*! @endcond */ |
| 568 | |
/*! @brief Receive
    @details Receive data from rank *peer* into *recvbuff*.
    Rank *peer* needs to call ncclSend with the same *datatype* and the same *count*
    as this rank, or the operation will not complete.
    This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
    need to progress concurrently to complete, they must be fused within a ncclGroupStart/
    ncclGroupEnd section.
    @return Result code. See @ref rccl_result_code for more details.

    @param[out] recvbuff  Data array to receive into
    @param[in]  count     Number of elements to receive
    @param[in]  datatype  Data buffer element datatype
    @param[in]  peer      Peer rank to receive from
    @param[in]  comm      Communicator group object to execute on
    @param[in]  stream    HIP stream to execute the operation on */
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, hipStream_t stream);
/*! @endcond */
| 590 | |
/*! @brief Gather
    @details The root device gathers *sendcount* values from each GPU into *recvbuff*,
    placing the data received from rank i at offset i*sendcount.
    Assumes recvcount is equal to nranks*sendcount, which means that *recvbuff*
    should have a size of at least nranks*sendcount elements.
    In-place operation will happen if sendbuff == recvbuff + rank * sendcount.
    *recvbuff* may be NULL on ranks other than *root*, since only *root* receives.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in]  sendbuff  Data array to send (at least *sendcount* elements on every rank)
    @param[out] recvbuff  Data array to receive into on *root*
    @param[in]  sendcount Number of elements each rank sends
    @param[in]  datatype  Data buffer element datatype
    @param[in]  root      Rank that receives data from all other ranks
    @param[in]  comm      Communicator group object to execute on
    @param[in]  stream    HIP stream to execute collective on */
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
/*! @endcond */
| 613 | |
/*! @brief Scatter
    @details Data on the *root* rank is scattered over the devices so that *recvbuff*
    on rank i will contain the i-th block of the data on *root*.
    Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff*
    should have a size of at least nranks*recvcount elements.
    In-place operation will happen if recvbuff == sendbuff + rank * recvcount.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in]  sendbuff  Data array to send (on *root* rank). May be NULL on other ranks.
    @param[out] recvbuff  Data array to receive this rank's partial subarray into
    @param[in]  recvcount Number of elements each rank receives
    @param[in]  datatype  Data buffer element datatype
    @param[in]  root      Rank that scatters data to all other ranks
    @param[in]  comm      Communicator group object to execute on
    @param[in]  stream    HIP stream to execute collective on */
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm,
    hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm,
    hipStream_t stream);
/*! @endcond */
| 637 | |
/*! @brief All-To-All
    @details Device (i) sends its (j)th block of data to device (j), where it is
    placed as the (i)th block. Each block sent/received has *count* elements, which
    means that *recvbuff* and *sendbuff* should have a size of nranks*count elements.
    In-place operation is NOT supported. It is the user's responsibility
    to ensure that sendbuff and recvbuff are distinct.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in]  sendbuff  Data array to send (contains one block for each other rank)
    @param[out] recvbuff  Data array to receive (contains one block from each other rank)
    @param[in]  count     Number of elements exchanged between each pair of ranks
    @param[in]  datatype  Data buffer element datatype
    @param[in]  comm      Communicator group object to execute on
    @param[in]  stream    HIP stream to execute collective on */
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*! @endcond */
| 658 | |
/*! @brief All-To-Allv
    @details Device (i) sends sendcounts[j] elements of data starting at offset
    sdispls[j] to device (j). At the same time, device (i) receives recvcounts[j]
    elements of data from device (j), placing them at offset rdispls[j].
    sendcounts, sdispls, recvcounts and rdispls are all measured in units
    of *datatype*, not bytes.
    In-place operation will happen if sendbuff == recvbuff.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in]  sendbuff   Data array to send (contains blocks for each other rank)
    @param[in]  sendcounts Array containing the number of elements to send to each participating rank
    @param[in]  sdispls    Array of element offsets into *sendbuff*, one per participating rank
    @param[out] recvbuff   Data array to receive (contains blocks from each other rank)
    @param[in]  recvcounts Array containing the number of elements to receive from each participating rank
    @param[in]  rdispls    Array of element offsets into *recvbuff*, one per participating rank
    @param[in]  datatype   Data buffer element datatype
    @param[in]  comm       Communicator group object to execute on
    @param[in]  stream     HIP stream to execute collective on */
ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[],
    const size_t sdispls[], void *recvbuff, const size_t recvcounts[],
    const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclAllToAllv(const void *sendbuff, const size_t sendcounts[],
    const size_t sdispls[], void *recvbuff, const size_t recvcounts[],
    const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*! @endcond */
| 685 | |
| 686 | /*! @} */ |
| 687 | |
| 688 | /*! @defgroup msccl_api MSCCL Algorithm |
| 689 | @details API calls relating to the optional MSCCL algorithm datapath |
| 690 | @{ */ |
| 691 | |
/*! @brief Opaque handle to a loaded MSCCL algorithm
    @details Obtained from mscclLoadAlgo and passed to mscclRunAlgo / mscclUnloadAlgo */
typedef int mscclAlgoHandle_t;
| 694 | |
/*! @brief MSCCL Load Algorithm
    @details Load the MSCCL algorithm file specified by *mscclAlgoFilePath* and return
    its handle via *mscclAlgoHandle*. This API is expected to be called by the MSCCL
    scheduler rather than by end users.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in]  mscclAlgoFilePath Path to the MSCCL algorithm file
    @param[out] mscclAlgoHandle   Returned handle to the loaded MSCCL algorithm
    @param[in]  rank              Current rank */
ncclResult_t mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank);
/*! @cond include_hidden */
ncclResult_t pmscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank);
/*! @endcond */
| 708 | |
/*! @brief MSCCL Run Algorithm
    @details Run the MSCCL algorithm specified by *mscclAlgoHandle*. The parameter
    list merges all possible parameters required by different operations, as this
    is a general-purpose API. Parameters not used by the selected algorithm are
    ignored. This API is expected to be called by the MSCCL scheduler rather than
    by end users.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in]  sendBuff        Data array to send
    @param[in]  sendCounts      Array containing the number of elements to send to each participating rank
    @param[in]  sDisPls         Array of offsets into *sendBuff*, one per participating rank
    @param[out] recvBuff        Data array to receive
    @param[in]  recvCounts      Array containing the number of elements to receive from each participating rank
    @param[in]  rDisPls         Array of offsets into *recvBuff*, one per participating rank
    @param[in]  count           Number of elements
    @param[in]  dataType        Data buffer element datatype
    @param[in]  root            Root rank index
    @param[in]  peer            Peer rank index
    @param[in]  op              Reduction operator
    @param[in]  mscclAlgoHandle Handle to the MSCCL algorithm (from mscclLoadAlgo)
    @param[in]  comm            Communicator group object to execute on
    @param[in]  stream          HIP stream to execute collective on */
ncclResult_t mscclRunAlgo(
    const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
    void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
    size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
    mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pmscclRunAlgo(
    const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
    void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
    size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
    mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream);
/*! @endcond */
| 742 | |
/*! @brief MSCCL Unload Algorithm
    @details Unload an MSCCL algorithm previously loaded with mscclLoadAlgo, identified
    by its handle. This API is expected to be called by the MSCCL scheduler rather
    than by end users.
    @return Result code. See @ref rccl_result_code for more details.

    @param[in] mscclAlgoHandle Handle of the MSCCL algorithm to unload
*/
ncclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
/*! @cond include_hidden */
ncclResult_t pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
/*! @endcond */
| 754 | /*! @} */ |
| 755 | |
| 756 | |
| 757 | /*! @defgroup rccl_group_api Group semantics |
| 758 | @details When managing multiple GPUs from a single thread, and since RCCL collective |
| 759 | calls may perform inter-CPU synchronization, we need to "group" calls for |
| 760 | different ranks/devices into a single call. |
| 761 | |
| 762 | Grouping RCCL calls as being part of the same collective operation is done |
| 763 | using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all |
| 764 | collective calls until the ncclGroupEnd call, which will wait for all calls |
| 765 | to be complete. Note that for collective communication, ncclGroupEnd only |
| 766 | guarantees that the operations are enqueued on the streams, not that |
| 767 | the operation is effectively done. |
| 768 | |
    Both collective communication and ncclCommInitRank can be used in conjunction
    with ncclGroupStart/ncclGroupEnd, but not together.
| 771 | |
    Group semantics also allow fusing multiple operations on the same device
    to improve performance (for aggregated collective calls), or to permit
    concurrent progress of multiple send/receive operations.
| 775 | @{ */ |
| 776 | |
| 777 | /*! @brief Group Start |
| 778 | @details Start a group call. All calls to RCCL until ncclGroupEnd will be fused into |
| 779 | a single RCCL operation. Nothing will be started on the HIP stream until |
| 780 | ncclGroupEnd. |
| 781 | @return Result code. See @ref rccl_result_code for more details. */ |
| 782 | ncclResult_t ncclGroupStart(); |
| 783 | /*! @cond include_hidden */ |
| 784 | ncclResult_t pncclGroupStart(); |
| 785 | /*! @endcond */ |
| 786 | |
| 787 | /*! @brief Group End |
| 788 | @details End a group call. Start a fused RCCL operation consisting of all calls since |
| 789 | ncclGroupStart. Operations on the HIP stream depending on the RCCL operations |
| 790 | need to be called after ncclGroupEnd. |
| 791 | @return Result code. See @ref rccl_result_code for more details. */ |
| 792 | ncclResult_t ncclGroupEnd(); |
| 793 | /*! @cond include_hidden */ |
| 794 | ncclResult_t pncclGroupEnd(); |
| 795 | /*! @endcond */ |
| 796 | /*! @} */ |
| 797 | |
| 798 | #ifdef __cplusplus |
| 799 | } // end extern "C" |
| 800 | #endif |
| 801 | |
| 802 | #endif // end include guard |