mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-13 09:46:00 +00:00
done with the design
This commit is contained in:
@@ -165,8 +165,9 @@ struct mscclppConn {
|
||||
int remoteRank;
|
||||
int buffSize;
|
||||
mscclppTrigger *cpuTriggerFifo;
|
||||
int* fifoHead; // indicates where CPU needs to read work elements. Write by CPU only, read by both
|
||||
int* fifoTail; // indicates where GPU needs to write work elements. Write by GPU only, read by both
|
||||
// fifoTail indicates where CPU needs to read the head of the fifo. only accessible by CPU
|
||||
// No atomicity is required for fifoTail as only a single CPU thread accesses it.
|
||||
int fifoTail;
|
||||
uint64_t *remoteProxyFlag;
|
||||
uint64_t *cpuProxyFlag;
|
||||
void *cpuTriggerFifoGdrDesc;
|
||||
|
||||
@@ -79,6 +79,7 @@ struct mscclppDevConn {
|
||||
// // localBuff[srcOffset..srcOffset+size-1] <- remoteBuff[dstOffset..dstOffset+size-1]
|
||||
// virtual void pullRemoteBuff(size_t srcOffset, size_t dstOffset, size_t size);
|
||||
|
||||
int* triggerFifoHead; // indicates the tail of the fifo. only accessible by the gpu. for parallel, access use atomic
|
||||
mscclppTrigger* trigger;
|
||||
uint64_t* proxyFlag;
|
||||
};
|
||||
|
||||
@@ -179,7 +179,10 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, mscclppDevConn* devConnOut, i
|
||||
conn->devConn->localBuff = localBuff;
|
||||
conn->devConn->localFlag = localFlag;
|
||||
conn->devConn->tag = tag;
|
||||
|
||||
// TODO(saemal): these two should be shared for all P2P-DMA connections made from each GPU. Same for each IB driver.
|
||||
MSCCLPPCHECK(mscclppGdrCudaCalloc(&conn->cpuTriggerFifo, &conn->devConn->trigger, MSCCLPP_PROXY_FIFO_SIZE, &conn->cpuTriggerFifoGdrDesc));
|
||||
MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->triggerFifoHead, 1));
|
||||
|
||||
conn->ibCtx = NULL;
|
||||
conn->ibQp = NULL;
|
||||
|
||||
@@ -59,7 +59,7 @@ void* mscclppProxyServiceP2P(void* _args) {
|
||||
|
||||
while (*run) {
|
||||
// Poll to see if we are ready to send anything
|
||||
trigger.value = *(volatile uint64_t *)conn->cpuTriggerFifo;
|
||||
trigger.value = *(volatile uint64_t *)(&conn->cpuTriggerFifo[conn->triggerFifoHead]);
|
||||
if (trigger.value == 0) continue;
|
||||
|
||||
// Iterate over what send is needed
|
||||
@@ -79,6 +79,7 @@ void* mscclppProxyServiceP2P(void* _args) {
|
||||
// send completion
|
||||
volatile uint64_t *tmp = (volatile uint64_t *)conn->cpuTriggerFifo;
|
||||
*tmp = 0;
|
||||
conn->triggerFifoHead++;
|
||||
}
|
||||
*run = 1;
|
||||
PROXYCUDACHECK(cudaStreamDestroy(stream));
|
||||
|
||||
@@ -42,7 +42,8 @@ __global__ void kernel(int rank, int world_size)
|
||||
volatile uint64_t *localFlag = devConn.localFlag;
|
||||
volatile uint64_t *remoteFlag = devConn.remoteFlag;
|
||||
volatile uint64_t *proxyFlag = devConn.proxyFlag;
|
||||
volatile uint64_t *trig = (volatile uint64_t *)devConn.trigger;
|
||||
volatile uint64_t *trig = (volatile uint64_t *)devConn.triggerFifo[devConn.triggerFifoHead];
|
||||
devConn.triggerFifoHead++;
|
||||
uint64_t baseFlag = *localFlag;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
|
||||
Reference in New Issue
Block a user