From ea71849dca69f8b6fb98937f9e5c8d19f7ec5e77 Mon Sep 17 00:00:00 2001
From: Saeed Maleki <saemal@microsoft.com>
Date: Thu, 23 Mar 2023 02:23:15 +0000
Subject: [PATCH 1/3] more docs

---
 src/include/mscclpp.h | 49 +++++++++++++++++++++++++++----------------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h
index 13381610..9f0cc725 100644
--- a/src/include/mscclpp.h
+++ b/src/include/mscclpp.h
@@ -23,32 +23,45 @@ extern "C" {
 /***************************************************************************************************************
  * A mscclppDevConn provides a zero-copy connection between a sender and a receiver that are
  * connected via P2P NVLink or IB.
- * The communication API is one-sided meaning that both side of a connection do not need to synchronize 
- * with each other for a single transfer. This is unlike NCCL/MSCCL where for each send instruction, there  
- * needs to be a matching receive instruction. MPI_Put and MPI_Get are the closest programming model 
- * in MSCCL++.
+ * The communication API is one-sided meaning that for every single data transfer, only one side
+ * needs to be execute unlike a two-sided communication stack such as NCCL where both sides 
+ * need to execute a send and a receive instruction for every transfer.
+ ***************************************************************************************************************
+ * At connection setup time, a sender and the matching receiver need to call mscclppConnect to register 
+ * their buffers locally. Once all buffers are registered via mscclppConnect, mscclppConnectionSetup is 
+ * called to setup a bidirectional connection. With every connection, there is an associated CPU 
+ * proxy thread that performs the actual data transfer.
+ ***************************************************************************************************************
+ * Before using any of functionality of connections, mscclppProxyLaunch needs to be called to spawn the 
+ * proxy threads. There are currently two types of connections:
  * 
- * At connection setup, the sender and receiver register the respective buffers through mscclppConnect.
+ * P2P via NVLink: the DMA engine can perform the copy between the buffers. DMA engine has higher latency
+ * but has a higher bandwidth and costs no compute cycles on the GPU.
  * 
- * A connection communicates with a prody thread to perform the actual data transfer.  
- *    P2P via NVLink: the DMA engine can perform the copy between the buffers. DMA engine has higher latency
- *    but has a higher bandwidth and costs no compute cycles on the GPU.
- *    InfiniBand: the RDMA engine copies the data over via MLX devices.
- * 
- * Once the connection is setup, data transfer happens using the following functions:
- * 
- *  put:  The sender initiates a data transfer to the receiver.  
+ * InfiniBand: the RDMA engine copies the data over MLX devices.
+ ***************************************************************************************************************
+ * At the runtime, a GPU kernel has access to a mscclppDevConn object that provides the following functions:
+ * put:  The sender initiates a data transfer to the receiver.  
  * 
  * signal: The sender signals the receiver that data is ready to be consumed once the reciver has performed a wait().
  * 
  * wait: the reciever waits on the signal() to start reading the data.
  * 
- *    The sender should not reuse the buffer till the signal returns.
- *    The receiver should only access the data after the wait returns. 
+ * The sender should not reuse the buffer till the signal returns.
+ * The receiver should only access the data after the wait returns. 
  *   
- *  putWithSignal: The sender initiates a data transfer and signals the receiver that data is ready to be consumed. 
- *    This is an optimized version of a put followed by a signal.
- *  
+ * putWithSignal: The sender initiates a data transfer and signals the receiver that data is ready to be consumed. 
+ * This is an optimized version of a put followed by a signal.
+ * 
+ * Example:
+ * 
+ * // sender GPU
+ * devConn.put(data1)
+ * devConn.put(data2)
+ * devConn.put(data3)                                // receiver GPU
+ * // not OK to write to data1, data2, data3         // not OK to read data1, data2, data3   
+ * devConn.signal() -------------------------------> devConn.wait()
+ * // OK to write to data1, data2, data3             // OK to read data1, data2, data3
  **************************************************************************************************************/
 struct mscclppDevConn {
 #ifdef __CUDACC__

From ce660217b1c35805ff07bbb9d432d48efc325be5 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 23 Mar 2023 04:12:42 +0000
Subject: [PATCH 2/3] Update docs

---
 src/include/mscclpp.h | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h
index 9f0cc725..29ed0764 100644
--- a/src/include/mscclpp.h
+++ b/src/include/mscclpp.h
@@ -22,15 +22,16 @@ extern "C" {
 
 /***************************************************************************************************************
  * A mscclppDevConn provides a zero-copy connection between a sender and a receiver that are
- * connected via P2P NVLink or IB.
+ * connected via P2P NVLink or InfiniBand.
  * The communication API is one-sided meaning that for every single data transfer, only one side
- * needs to be execute unlike a two-sided communication stack such as NCCL where both sides 
- * need to execute a send and a receive instruction for every transfer.
+ * needs to execute unlike a two-sided communication stack such as NCCL where both sides 
+ * need to execute a send and a receive instruction respectively for every transfer.
  ***************************************************************************************************************
  * At connection setup time, a sender and the matching receiver need to call mscclppConnect to register 
  * their buffers locally. Once all buffers are registered via mscclppConnect, mscclppConnectionSetup is 
  * called to setup a bidirectional connection. With every connection, there is an associated CPU 
- * proxy thread that performs the actual data transfer.
+ * proxy thread that performs the actual data transfer using (R)DMA. DMA is optional for P2P NVLink connections
+ * where the GPU can perform the copy directly.
  ***************************************************************************************************************
  * Before using any of functionality of connections, mscclppProxyLaunch needs to be called to spawn the 
  * proxy threads. There are currently two types of connections:
@@ -41,16 +42,17 @@ extern "C" {
  * InfiniBand: the RDMA engine copies the data over MLX devices.
  ***************************************************************************************************************
  * At the runtime, a GPU kernel has access to a mscclppDevConn object that provides the following functions:
- * put:  The sender initiates a data transfer to the receiver.  
  * 
- * signal: The sender signals the receiver that data is ready to be consumed once the reciver has performed a wait().
+ * put(): the sender initiates a data transfer to the receiver.  
  * 
- * wait: the reciever waits on the signal() to start reading the data.
+ * signal(): the sender signals the receiver that data is ready to be consumed once the reciver has performed a wait().
+ * 
+ * wait(): the reciever waits on the signal() to start reading the data.
  * 
  * The sender should not reuse the buffer till the signal returns.
  * The receiver should only access the data after the wait returns. 
  *   
- * putWithSignal: The sender initiates a data transfer and signals the receiver that data is ready to be consumed. 
+ * putWithSignal(): The sender initiates a data transfer and signals the receiver that data is ready to be consumed. 
  * This is an optimized version of a put followed by a signal.
  * 
  * Example:
@@ -68,6 +70,7 @@ struct mscclppDevConn {
  __forceinline__ __device__ void put(uint64_t dataOffset, uint64_t dataSize){
     fifo.push(mscclppData, dataOffset, dataSize);
   }
+
   __forceinline__ __device__ void signal(){
     epochIncrement();
     uint64_t curFifoHead = fifo.push(mscclppFlag | mscclppSync, 1, 1);
@@ -86,9 +89,9 @@ struct mscclppDevConn {
     while (*(volatile uint64_t*)proxyEpochId < (*recvEpochId));
   }
 
-   __forceinline__ __device__ void epochIncrement(){
-   *(volatile uint64_t*)sendEpochId += 1;
- }
+  __forceinline__ __device__ void epochIncrement(){
+    *(volatile uint64_t*)sendEpochId += 1;
+  }
 
 #endif
   int tag;

From 3e6bb0ec0c8ec1bd656f78bb71bbbf66f6d05550 Mon Sep 17 00:00:00 2001
From: Saeed Maleki <saemal@microsoft.com>
Date: Thu, 23 Mar 2023 04:47:34 +0000
Subject: [PATCH 3/3] minor changes

---
 src/include/mscclpp.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h
index 29ed0764..a233eda3 100644
--- a/src/include/mscclpp.h
+++ b/src/include/mscclpp.h
@@ -25,7 +25,7 @@ extern "C" {
  * connected via P2P NVLink or InfiniBand.
  * The communication API is one-sided meaning that for every single data transfer, only one side
  * needs to execute unlike a two-sided communication stack such as NCCL where both sides 
- * need to execute a send and a receive instruction respectively for every transfer.
+ * need to execute a send and a receive instruction, respectively, for every transfer.
  ***************************************************************************************************************
  * At connection setup time, a sender and the matching receiver need to call mscclppConnect to register 
  * their buffers locally. Once all buffers are registered via mscclppConnect, mscclppConnectionSetup is 
@@ -33,7 +33,7 @@ extern "C" {
  * proxy thread that performs the actual data transfer using (R)DMA. DMA is optional for P2P NVLink connections
  * where the GPU can perform the copy directly.
  ***************************************************************************************************************
- * Before using any of functionality of connections, mscclppProxyLaunch needs to be called to spawn the 
+ * Before using any of functionality of connections, mscclppProxyLaunch needs to be called to spawn the
  * proxy threads. There are currently two types of connections:
  * 
  * P2P via NVLink: the DMA engine can perform the copy between the buffers. DMA engine has higher latency
@@ -43,16 +43,16 @@ extern "C" {
  ***************************************************************************************************************
  * At the runtime, a GPU kernel has access to a mscclppDevConn object that provides the following functions:
  * 
- * put(): the sender initiates a data transfer to the receiver.  
+ * put(): the sender initiates a data transfer to the receiver.
  * 
  * signal(): the sender signals the receiver that data is ready to be consumed once the reciver has performed a wait().
  * 
  * wait(): the reciever waits on the signal() to start reading the data.
  * 
  * The sender should not reuse the buffer till the signal returns.
- * The receiver should only access the data after the wait returns. 
+ * The receiver should only access the data after the wait returns.
  *   
- * putWithSignal(): The sender initiates a data transfer and signals the receiver that data is ready to be consumed. 
+ * putWithSignal(): the sender initiates a data transfer and signals the receiver that data is ready to be consumed.
  * This is an optimized version of a put followed by a signal.
  * 
  * Example:
@@ -61,7 +61,7 @@ extern "C" {
  * devConn.put(data1)
  * devConn.put(data2)
  * devConn.put(data3)                                // receiver GPU
- * // not OK to write to data1, data2, data3         // not OK to read data1, data2, data3   
+ * // not OK to write to data1, data2, data3         // not OK to read data1, data2, data3
  * devConn.signal() -------------------------------> devConn.wait()
  * // OK to write to data1, data2, data3             // OK to read data1, data2, data3
  **************************************************************************************************************/