#ifndef MSCCLPP_H_ #define MSCCLPP_H_ #include #include #if CUDART_VERSION >= 11000 #include #endif #include #define MSCCLPP_MAJOR 0 #define MSCCLPP_MINOR 1 #define MSCCLPP_PROXY_FIFO_SIZE 8 #define MSCCLPP_VERSION (MSCCLPP_MAJOR * 100 + MSCCLPP_MINOR) #ifdef __cplusplus extern "C" { #endif typedef enum : uint64_t { mscclppData = 0x1, mscclppFlag = 0x2, mscclppSync = 0x4} mscclppTriggerType_t; #define MSCCLPP_BITS_SIZE 32 #define MSCCLPP_BITS_OFFSET 32 #define MSCCLPP_BITS_TYPE 3 #define MSCCLPP_BITS_CONNID 10 // the summation of number of bits must be 128 or less union alignas(16) mscclppTrigger { uint64_t value[2]; struct { // first 64 bits: value[0] uint64_t dataSize : MSCCLPP_BITS_SIZE; uint64_t dataOffset : MSCCLPP_BITS_OFFSET; uint64_t : (64-MSCCLPP_BITS_SIZE-MSCCLPP_BITS_OFFSET); // ensure 64-bit alignment // second 64 bits: value[1] uint64_t connId : MSCCLPP_BITS_CONNID; uint64_t type : MSCCLPP_BITS_TYPE; uint64_t : (64-MSCCLPP_BITS_CONNID-MSCCLPP_BITS_TYPE); // ensure 64-bit alignment } fields; }; typedef uint64_t mscclppRequest_t; typedef mscclppTrigger* mscclppTrigger_t; struct mscclppConcurrentFifo { #ifdef __CUDACC__ __forceinline__ __device__ mscclppRequest_t getTrigger(mscclppTrigger_t* trig) { uint64_t curFifoHead = atomicAdd((unsigned long long int*)this->triggerFifoHead,1); while (curFifoHead >= MSCCLPP_PROXY_FIFO_SIZE + *((volatile uint64_t*)this->triggerFifoTail)); *trig = &this->triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE]; return curFifoHead; } __forceinline__ __device__ void setTrigger(mscclppTrigger_t trig, uint64_t type, uint64_t dataOffset, uint64_t dataSize) { asm volatile( "st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(&trig->value), "l"((dataOffset << (MSCCLPP_BITS_SIZE)) + (dataSize)), "l"((type << MSCCLPP_BITS_CONNID) + this->connId)); } __forceinline__ __device__ void waitTrigger(mscclppRequest_t req) { while (*(volatile uint64_t *)triggerFifoTail <= req); } #endif // __CUDACC__ mscclppTrigger* triggerFifo; uint64_t* triggerFifoTail; // read by both device and host. written only by host uint64_t* triggerFifoHead; // read by both device and host. written only by device int connId; }; /*************************************************************************************************************** * A mscclppDevConn provides a zero-copy connection between a sender and a receiver that are * connected via P2P NVLink or IB. * The communication API is one-sided meaning that not both side of a connection are involved * in a single transfer. This is unlike NCCL/MSCCL where for each send instruction, there needs * to be a matching receive instruction. MPI_Put and MPI_Get are the closest programming model * in MSCCL++. * * At connection setup, the sender and receiver register the respective buffers through mscclppConnect. * * After connection setup, if the connection type is: * P2P via NVLink: mscclppDevConn has access to remoteBuff and remoteFlag * InfiniBand: mscclppDevConn has no access to remoteBuff or remoteFlag * * For any connection, there is a proxy thread associated with it: * P2P via NVLink: the DMA engine can perform the copy between the buffers. DMA engine has higher latency * but has a higher bandwidth and costs no compute cycles on the GPU. * InfiniBand: the RDMA engine copies the data over via MLX devices. * * Memory consistency: * In general, there is no guarantee on the order in which bytes are received. MSCCL++ relies on the following * property to meet memory consistency: consecutive wirtes/reads by the CPU proxy are observed by the GPU in the same order * as they are issued in. This means that for a sequence of writes done by a CPU proxy, we need to write a synchornization * value written in flag that the receiving side of the GPU needs to poll on to ensure the arrival of writes. * * The communication from GPU to CPU proxy happens via trigger which is allocated on the GPU global memory and mounted on the CPU * with GDR copy. The CPU proxy has a fifo of work elements which are communicated via trigger. getTrigger gets a place on the fifo * (note that an atomicInc is used to enable concurrent calls to getTrigger). setTrigger rights the right work element to the fifo * so that the CPU proxy can consume it. * **************************************************************************************************************/ struct mscclppDevConn { int tag; void* localBuff; uint64_t* localFlag; void* remoteBuff; uint64_t* remoteFlag; uint64_t* proxyFlag; // this is only written by the proxy thread // multiple threads can access the fifo concurrently struct mscclppConcurrentFifo fifo; }; typedef struct mscclppComm* mscclppComm_t; typedef struct mscclppDevConn mscclppDevConn_t; #define MSCCLPP_UNIQUE_ID_BYTES 128 typedef struct { char internal[MSCCLPP_UNIQUE_ID_BYTES]; } mscclppUniqueId; /* Error type */ typedef enum { mscclppSuccess = 0, mscclppUnhandledCudaError = 1, mscclppSystemError = 2, mscclppInternalError = 3, mscclppInvalidArgument = 4, mscclppInvalidUsage = 5, mscclppRemoteError = 6, mscclppInProgress = 7, mscclppNumResults = 8 } mscclppResult_t; mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* uniqueId); /* Reduction operation selector */ typedef enum { mscclppNumOps_dummy = 5 } mscclppRedOp_dummy_t; typedef enum { mscclppSum = 0, mscclppProd = 1, mscclppMax = 2, mscclppMin = 3, mscclppAvg = 4, /* mscclppNumOps: The number of built-in mscclppRedOp_t values. Also * serves as the least possible value for dynamic mscclppRedOp_t's * as constructed by mscclppRedOpCreate*** functions. */ mscclppNumOps = 5, /* mscclppMaxRedOp: The largest valid value for mscclppRedOp_t. * It is defined to be the largest signed value (since compilers * are permitted to use signed enums) that won't grow * sizeof(mscclppRedOp_t) when compared to previous MSCCLPP versions to * maintain ABI compatibility. */ mscclppMaxRedOp = 0x7fffffff>>(32-8*sizeof(mscclppRedOp_dummy_t)) } mscclppRedOp_t; /* Data types */ typedef enum { mscclppInt8 = 0, mscclppChar = 0, mscclppUint8 = 1, mscclppInt32 = 2, mscclppInt = 2, mscclppUint32 = 3, mscclppInt64 = 4, mscclppUint64 = 5, mscclppFloat16 = 6, mscclppHalf = 6, mscclppFloat32 = 7, mscclppFloat = 7, mscclppFloat64 = 8, mscclppDouble = 8, #if defined(__CUDA_BF16_TYPES_EXIST__) mscclppBfloat16 = 9, mscclppNumTypes = 10 #else mscclppNumTypes = 9 #endif } mscclppDataType_t; /* Transport Types */ typedef enum { mscclppTransportP2P = 0, mscclppTransportSHM = 1, // TODO(chhwang): not implemented yet mscclppTransportIB = 2, } mscclppTransport_t; mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, int rank, const char* ip_port_pair); mscclppResult_t mscclppBootStrapAllGather(mscclppComm_t comm, void* data, int size); mscclppResult_t mscclppCommDestroy(mscclppComm_t comm); mscclppResult_t mscclppConnect(mscclppComm_t comm, mscclppDevConn* devConnOut, int remoteRank, void* localBuff, size_t buffSize, uint64_t* localFlag, int tag, mscclppTransport_t transportType, const char *ibDev=NULL); mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm); mscclppResult_t mscclppProxyLaunch(mscclppComm_t comm); mscclppResult_t mscclppProxyStop(mscclppComm_t comm); #ifdef __cplusplus } // end extern "C" #endif #endif // MSCCLPP_H_