From ea71849dca69f8b6fb98937f9e5c8d19f7ec5e77 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 23 Mar 2023 02:23:15 +0000 Subject: [PATCH 1/3] more docs --- src/include/mscclpp.h | 49 +++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index 13381610..9f0cc725 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -23,32 +23,45 @@ extern "C" { /*************************************************************************************************************** * A mscclppDevConn provides a zero-copy connection between a sender and a receiver that are * connected via P2P NVLink or IB. - * The communication API is one-sided meaning that both side of a connection do not need to synchronize - * with each other for a single transfer. This is unlike NCCL/MSCCL where for each send instruction, there - * needs to be a matching receive instruction. MPI_Put and MPI_Get are the closest programming model - * in MSCCL++. + * The communication API is one-sided meaning that for every single data transfer, only one side + * needs to be execute unlike a two-sided communication stack such as NCCL where both sides + * need to execute a send and a receive instruction for every transfer. + *************************************************************************************************************** + * At connection setup time, a sender and the matching receiver need to call mscclppConnect to register + * their buffers locally. Once all buffers are registered via mscclppConnect, mscclppConnectionSetup is + * called to setup a bidirectional connection. With every connection, there is an associated CPU + * proxy thread that performs the actual data transfer. + *************************************************************************************************************** + * Before using any of functionality of connections, mscclppProxyLaunch needs to be called to spawn the + * proxy threads. There are currently two types of connections: * - * At connection setup, the sender and receiver register the respective buffers through mscclppConnect. + * P2P via NVLink: the DMA engine can perform the copy between the buffers. DMA engine has higher latency + * but has a higher bandwidth and costs no compute cycles on the GPU. * - * A connection communicates with a prody thread to perform the actual data transfer. - * P2P via NVLink: the DMA engine can perform the copy between the buffers. DMA engine has higher latency - * but has a higher bandwidth and costs no compute cycles on the GPU. - * InfiniBand: the RDMA engine copies the data over via MLX devices. - * - * Once the connection is setup, data transfer happens using the following functions: - * - * put: The sender initiates a data transfer to the receiver. + * InfiniBand: the RDMA engine copies the data over MLX devices. + *************************************************************************************************************** + * At the runtime, a GPU kernel has access to a mscclppDevConn object that provides the following functions: + * put: The sender initiates a data transfer to the receiver. * * signal: The sender signals the receiver that data is ready to be consumed once the reciver has performed a wait(). * * wait: the reciever waits on the signal() to start reading the data. * - * The sender should not reuse the buffer till the signal returns. - * The receiver should only access the data after the wait returns. + * The sender should not reuse the buffer till the signal returns. + * The receiver should only access the data after the wait returns. * - * putWithSignal: The sender initiates a data transfer and signals the receiver that data is ready to be consumed. - * This is an optimized version of a put followed by a signal. - * + * putWithSignal: The sender initiates a data transfer and signals the receiver that data is ready to be consumed. + * This is an optimized version of a put followed by a signal. + * + * Example: + * + * // sender GPU + * devConn.put(data1) + * devConn.put(data2) + * devConn.put(data3) // receiver GPU + * // not OK to write to data1, data2, data3 // not OK to read data1, data2, data3 + * devConn.signal() -------------------------------> devConn.wait() + * // OK to write to data1, data2, data3 // OK to read data1, data2, data3 **************************************************************************************************************/ struct mscclppDevConn { #ifdef __CUDACC__ From ce660217b1c35805ff07bbb9d432d48efc325be5 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 23 Mar 2023 04:12:42 +0000 Subject: [PATCH 2/3] Update docs --- src/include/mscclpp.h | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index 9f0cc725..29ed0764 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -22,15 +22,16 @@ extern "C" { /*************************************************************************************************************** * A mscclppDevConn provides a zero-copy connection between a sender and a receiver that are - * connected via P2P NVLink or IB. + * connected via P2P NVLink or InfiniBand. * The communication API is one-sided meaning that for every single data transfer, only one side - * needs to be execute unlike a two-sided communication stack such as NCCL where both sides - * need to execute a send and a receive instruction for every transfer. + * needs to execute unlike a two-sided communication stack such as NCCL where both sides + * need to execute a send and a receive instruction respectively for every transfer. *************************************************************************************************************** * At connection setup time, a sender and the matching receiver need to call mscclppConnect to register * their buffers locally. Once all buffers are registered via mscclppConnect, mscclppConnectionSetup is * called to setup a bidirectional connection. With every connection, there is an associated CPU - * proxy thread that performs the actual data transfer. + * proxy thread that performs the actual data transfer using (R)DMA. DMA is optional for P2P NVLink connections + * where the GPU can perform the copy directly. *************************************************************************************************************** * Before using any of functionality of connections, mscclppProxyLaunch needs to be called to spawn the * proxy threads. There are currently two types of connections: @@ -41,16 +42,17 @@ extern "C" { * InfiniBand: the RDMA engine copies the data over MLX devices. *************************************************************************************************************** * At the runtime, a GPU kernel has access to a mscclppDevConn object that provides the following functions: - * put: The sender initiates a data transfer to the receiver. * - * signal: The sender signals the receiver that data is ready to be consumed once the reciver has performed a wait(). + * put(): the sender initiates a data transfer to the receiver. * - * wait: the reciever waits on the signal() to start reading the data. + * signal(): the sender signals the receiver that data is ready to be consumed once the reciver has performed a wait(). + * + * wait(): the reciever waits on the signal() to start reading the data. * * The sender should not reuse the buffer till the signal returns. * The receiver should only access the data after the wait returns. * - * putWithSignal: The sender initiates a data transfer and signals the receiver that data is ready to be consumed. + * putWithSignal(): The sender initiates a data transfer and signals the receiver that data is ready to be consumed. * This is an optimized version of a put followed by a signal. * * Example: @@ -68,6 +70,7 @@ struct mscclppDevConn { __forceinline__ __device__ void put(uint64_t dataOffset, uint64_t dataSize){ fifo.push(mscclppData, dataOffset, dataSize); } + __forceinline__ __device__ void signal(){ epochIncrement(); uint64_t curFifoHead = fifo.push(mscclppFlag | mscclppSync, 1, 1); @@ -86,9 +89,9 @@ struct mscclppDevConn { while (*(volatile uint64_t*)proxyEpochId < (*recvEpochId)); } - __forceinline__ __device__ void epochIncrement(){ - *(volatile uint64_t*)sendEpochId += 1; - } + __forceinline__ __device__ void epochIncrement(){ + *(volatile uint64_t*)sendEpochId += 1; + } #endif int tag; From 3e6bb0ec0c8ec1bd656f78bb71bbbf66f6d05550 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 23 Mar 2023 04:47:34 +0000 Subject: [PATCH 3/3] minor changes --- src/include/mscclpp.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index 29ed0764..a233eda3 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -25,7 +25,7 @@ extern "C" { * connected via P2P NVLink or InfiniBand. * The communication API is one-sided meaning that for every single data transfer, only one side * needs to execute unlike a two-sided communication stack such as NCCL where both sides - * need to execute a send and a receive instruction respectively for every transfer. + * need to execute a send and a receive instruction, respectively, for every transfer. *************************************************************************************************************** * At connection setup time, a sender and the matching receiver need to call mscclppConnect to register * their buffers locally. Once all buffers are registered via mscclppConnect, mscclppConnectionSetup is @@ -33,7 +33,7 @@ extern "C" { * proxy thread that performs the actual data transfer using (R)DMA. DMA is optional for P2P NVLink connections * where the GPU can perform the copy directly. *************************************************************************************************************** - * Before using any of functionality of connections, mscclppProxyLaunch needs to be called to spawn the + * Before using any of functionality of connections, mscclppProxyLaunch needs to be called to spawn the * proxy threads. There are currently two types of connections: * * P2P via NVLink: the DMA engine can perform the copy between the buffers. DMA engine has higher latency @@ -43,16 +43,16 @@ extern "C" { *************************************************************************************************************** * At the runtime, a GPU kernel has access to a mscclppDevConn object that provides the following functions: * - * put(): the sender initiates a data transfer to the receiver. + * put(): the sender initiates a data transfer to the receiver. * * signal(): the sender signals the receiver that data is ready to be consumed once the reciver has performed a wait(). * * wait(): the reciever waits on the signal() to start reading the data. * * The sender should not reuse the buffer till the signal returns. - * The receiver should only access the data after the wait returns. + * The receiver should only access the data after the wait returns. * - * putWithSignal(): The sender initiates a data transfer and signals the receiver that data is ready to be consumed. + * putWithSignal(): the sender initiates a data transfer and signals the receiver that data is ready to be consumed. * This is an optimized version of a put followed by a signal. * * Example: @@ -61,7 +61,7 @@ extern "C" { * devConn.put(data1) * devConn.put(data2) * devConn.put(data3) // receiver GPU - * // not OK to write to data1, data2, data3 // not OK to read data1, data2, data3 + * // not OK to write to data1, data2, data3 // not OK to read data1, data2, data3 * devConn.signal() -------------------------------> devConn.wait() * // OK to write to data1, data2, data3 // OK to read data1, data2, data3 **************************************************************************************************************/