diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index a47593eb..2a92e987 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -22,18 +22,23 @@ typedef enum : uint64_t { mscclppData = 0x1, mscclppFlag = 0x2, mscclppSync = 0x4} mscclppTriggerType_t; -#define MSCCLPP_SIZE_BITS 30 -#define MSCCLPP_OFFSET_BITS 31 +#define MSCCLPP_BITS_SIZE 30 +#define MSCCLPP_BITS_OFFSET 31 +#define MSCCLPP_BITS_CONNID 10 -#define TRIGGER_VALUE(__TYPE__,__OFFSET__,__SIZE__) (((((__TYPE__) << MSCCLPP_OFFSET_BITS) + (__OFFSET__)) << MSCCLPP_SIZE_BITS) + __SIZE__ ) +#define TRIGGER_VALUE(__TYPE__,__OFFSET__,__SIZE__) (((((__TYPE__) << MSCCLPP_BITS_OFFSET) + (__OFFSET__)) << MSCCLPP_BITS_SIZE) + __SIZE__ ) -// the summation of number of bits must be 64 or less -union alignas(8) mscclppTrigger { - uint64_t value; +// the summation of number of bits must be 128 or less +union alignas(16) mscclppTrigger { + uint64_t value[2]; struct { - uint64_t dataSize : MSCCLPP_SIZE_BITS; - uint64_t dataOffset : MSCCLPP_OFFSET_BITS; + // high 64 bits: value[0] + uint64_t dataSize : MSCCLPP_BITS_SIZE; + uint64_t dataOffset : MSCCLPP_BITS_OFFSET; uint64_t type : 3; + uint64_t : 0; // ensure 64-bit alignment + // low 64 bits: value[1] + uint64_t connId : MSCCLPP_BITS_CONNID; } fields; }; diff --git a/src/proxy.cc b/src/proxy.cc index aaf6d6c5..f7a772b4 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -68,8 +68,13 @@ void* mscclppProxyServiceP2P(void* _args) { while (*run == MSCCLPP_PROXY_RUN_STATE_RUNNING) { for (struct mscclppConn *conn : conns) { // Poll to see if we are ready to send anything - trigger.value = *(volatile uint64_t *)(&conn->cpuTriggerFifo[conn->fifoTail]); - if (trigger.value == 0) continue; + trigger.value[0] = *(volatile uint64_t *)(conn->cpuTriggerFifo[conn->fifoTail].value); + if (trigger.value[0] == 0) continue; + // TODO(chhwang): latency overhead of reading value[1] is too large (~9us) + trigger.value[1] = *(volatile uint64_t *)(conn->cpuTriggerFifo[conn->fifoTail].value + 1); + if (trigger.value[1] != 42) { + WARN("Unexpected value"); + } // Iterate over what send is needed if (trigger.fields.type & mscclppData){ @@ -85,7 +90,7 @@ void* mscclppProxyServiceP2P(void* _args) { PROXYCUDACHECK(cudaStreamSynchronize(stream)); } - // Send completion + // Send completion: reset only the high 64 bits volatile uint64_t *tmp = (volatile uint64_t *)(&conn->cpuTriggerFifo[conn->fifoTail]); *tmp = 0; conn->fifoTail++; @@ -196,8 +201,13 @@ void* mscclppProxyServiceIb(void* _args) { } #else // (MSCCLPP_PROXY_FLAG_SET_BY_RDMA == 1) // Poll to see if we are ready to send anything - trigger.value = *(volatile uint64_t *)(&conn->cpuTriggerFifo[conn->fifoTail]); - if (trigger.value == 0) continue; + trigger.value[0] = *(volatile uint64_t *)(conn->cpuTriggerFifo[conn->fifoTail].value); + if (trigger.value[0] == 0) continue; + // TODO(chhwang): latency overhead of reading value[1] is too large (~9us) + trigger.value[1] = *(volatile uint64_t *)(conn->cpuTriggerFifo[conn->fifoTail].value + 1); + if (trigger.value[1] != 42) { + WARN("Unexpected value"); + } if (trigger.fields.type & mscclppData) { conn->ibQp->stageSend(conn->ibBuffMr, &conn->ibBuffMrInfo, (uint32_t)trigger.fields.dataSize, @@ -240,7 +250,7 @@ void* mscclppProxyServiceIb(void* _args) { } } - // Send completion + // Send completion: reset only the high 64 bits volatile uint64_t *tmp = (volatile uint64_t *)(&conn->cpuTriggerFifo[conn->fifoTail]); *tmp = 0; conn->fifoTail++; diff --git a/tests/p2p_test.cu b/tests/p2p_test.cu index f90deb44..469b17bb 100644 --- a/tests/p2p_test.cu +++ b/tests/p2p_test.cu @@ -42,6 +42,17 @@ static double getTime(void) __constant__ mscclppDevConn_t constDevConns[16]; +__forceinline__ __device__ void setTrigger(mscclppTrigger *trig, uint64_t connId, uint64_t type, + uint64_t dataOffset, uint64_t dataSize) +{ + asm volatile( + "st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(&trig->value), + "l"((type << (MSCCLPP_BITS_SIZE + MSCCLPP_BITS_OFFSET)) + + (dataOffset << (MSCCLPP_BITS_SIZE)) + + (dataSize)), + "l"(connId)); +} + __global__ void kernel(int rank, int world_size) { if (threadIdx.x % 32 != 0) return; @@ -54,7 +65,7 @@ __global__ void kernel(int rank, int world_size) volatile uint64_t *remoteFlag = devConn.remoteFlag; volatile uint64_t *proxyFlag = devConn.proxyFlag; int curFifoHead = *devConn.triggerFifoHead; - volatile uint64_t *trig = (volatile uint64_t *)&devConn.trigger[curFifoHead]; + mscclppTrigger *trig = &devConn.trigger[curFifoHead]; curFifoHead += 1; if (curFifoHead == MSCCLPP_PROXY_FIFO_SIZE) curFifoHead = 0; @@ -78,12 +89,11 @@ __global__ void kernel(int rank, int world_size) #if (USE_DMA_FOR_P2P == 1) // Wait until the proxy have sent my data and flag - while (*trig != 0) {} + // Check only the high 64 bits + while (*(volatile uint64_t *)trig->value != 0) {} // Trigger sending data and flag - uint64_t dataOffset = rank * sizeof(int); - uint64_t dataSize = sizeof(int); - *trig = TRIGGER_VALUE(mscclppFlag | mscclppData, dataOffset, dataSize); + setTrigger(trig, /*for test*/42, mscclppFlag | mscclppData, rank * sizeof(int), sizeof(int)); // Wait for receiving data from remote rank while (*proxyFlag == baseFlag) {}