done with the design

This commit is contained in:
Saeed Maleki (saemal)
2023-03-06 12:26:46 -08:00
parent b663469bcd
commit dced4c4c14
5 changed files with 11 additions and 4 deletions

View File

@@ -165,8 +165,9 @@ struct mscclppConn {
int remoteRank;
int buffSize;
mscclppTrigger *cpuTriggerFifo;
int* fifoHead; // indicates where CPU needs to read work elements. Write by CPU only, read by both
int* fifoTail; // indicates where GPU needs to write work elements. Write by GPU only, read by both
// fifoTail indicates where the CPU needs to read the head of the fifo. It is only accessible by the CPU.
// No atomicity is required for fifoTail, as only a single CPU thread accesses it.
int fifoTail;
uint64_t *remoteProxyFlag;
uint64_t *cpuProxyFlag;
void *cpuTriggerFifoGdrDesc;

View File

@@ -79,6 +79,7 @@ struct mscclppDevConn {
// // localBuff[srcOffset..srcOffset+size-1] <- remoteBuff[dstOffset..dstOffset+size-1]
// virtual void pullRemoteBuff(size_t srcOffset, size_t dstOffset, size_t size);
int* triggerFifoHead; // indicates the tail of the fifo. Only accessible by the GPU; for parallel access, use atomics.
mscclppTrigger* trigger;
uint64_t* proxyFlag;
};

View File

@@ -179,7 +179,10 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, mscclppDevConn* devConnOut, i
conn->devConn->localBuff = localBuff;
conn->devConn->localFlag = localFlag;
conn->devConn->tag = tag;
// TODO(saemal): these two should be shared for all P2P-DMA connections made from each GPU. Same for each IB driver.
MSCCLPPCHECK(mscclppGdrCudaCalloc(&conn->cpuTriggerFifo, &conn->devConn->trigger, MSCCLPP_PROXY_FIFO_SIZE, &conn->cpuTriggerFifoGdrDesc));
MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->triggerFifoHead, 1));
conn->ibCtx = NULL;
conn->ibQp = NULL;

View File

@@ -59,7 +59,7 @@ void* mscclppProxyServiceP2P(void* _args) {
while (*run) {
// Poll to see if we are ready to send anything
trigger.value = *(volatile uint64_t *)conn->cpuTriggerFifo;
trigger.value = *(volatile uint64_t *)(&conn->cpuTriggerFifo[conn->triggerFifoHead]);
if (trigger.value == 0) continue;
// Iterate over what send is needed
@@ -79,6 +79,7 @@ void* mscclppProxyServiceP2P(void* _args) {
// send completion
volatile uint64_t *tmp = (volatile uint64_t *)conn->cpuTriggerFifo;
*tmp = 0;
conn->triggerFifoHead++;
}
*run = 1;
PROXYCUDACHECK(cudaStreamDestroy(stream));

View File

@@ -42,7 +42,8 @@ __global__ void kernel(int rank, int world_size)
volatile uint64_t *localFlag = devConn.localFlag;
volatile uint64_t *remoteFlag = devConn.remoteFlag;
volatile uint64_t *proxyFlag = devConn.proxyFlag;
volatile uint64_t *trig = (volatile uint64_t *)devConn.trigger;
volatile uint64_t *trig = (volatile uint64_t *)devConn.triggerFifo[devConn.triggerFifoHead];
devConn.triggerFifoHead++;
uint64_t baseFlag = *localFlag;
if (threadIdx.x == 0) {