128-bit trigger

This commit is contained in:
Changho Hwang
2023-03-10 10:49:36 +00:00
parent 85d92961a3
commit 1be76d128d
3 changed files with 44 additions and 19 deletions

View File

@@ -22,18 +22,23 @@ typedef enum : uint64_t { mscclppData = 0x1,
mscclppFlag = 0x2,
mscclppSync = 0x4} mscclppTriggerType_t;
#define MSCCLPP_SIZE_BITS 30
#define MSCCLPP_OFFSET_BITS 31
#define MSCCLPP_BITS_SIZE 30
#define MSCCLPP_BITS_OFFSET 31
#define MSCCLPP_BITS_CONNID 10
#define TRIGGER_VALUE(__TYPE__,__OFFSET__,__SIZE__) (((((__TYPE__) << MSCCLPP_OFFSET_BITS) + (__OFFSET__)) << MSCCLPP_SIZE_BITS) + __SIZE__ )
#define TRIGGER_VALUE(__TYPE__,__OFFSET__,__SIZE__) (((((__TYPE__) << MSCCLPP_BITS_OFFSET) + (__OFFSET__)) << MSCCLPP_BITS_SIZE) + __SIZE__ )
// the summation of number of bits must be 64 or less
union alignas(8) mscclppTrigger {
uint64_t value;
// the summation of number of bits must be 128 or less
union alignas(16) mscclppTrigger {
uint64_t value[2];
struct {
uint64_t dataSize : MSCCLPP_SIZE_BITS;
uint64_t dataOffset : MSCCLPP_OFFSET_BITS;
// high 64 bits: value[0]
uint64_t dataSize : MSCCLPP_BITS_SIZE;
uint64_t dataOffset : MSCCLPP_BITS_OFFSET;
uint64_t type : 3;
uint64_t : 0; // ensure 64-bit alignment
// low 64 bits: value[1]
uint64_t connId : MSCCLPP_BITS_CONNID;
} fields;
};

View File

@@ -68,8 +68,13 @@ void* mscclppProxyServiceP2P(void* _args) {
while (*run == MSCCLPP_PROXY_RUN_STATE_RUNNING) {
for (struct mscclppConn *conn : conns) {
// Poll to see if we are ready to send anything
trigger.value = *(volatile uint64_t *)(&conn->cpuTriggerFifo[conn->fifoTail]);
if (trigger.value == 0) continue;
trigger.value[0] = *(volatile uint64_t *)(conn->cpuTriggerFifo[conn->fifoTail].value);
if (trigger.value[0] == 0) continue;
// TODO(chhwang): latency overhead of reading value[1] is too large (~9us)
trigger.value[1] = *(volatile uint64_t *)(conn->cpuTriggerFifo[conn->fifoTail].value + 1);
if (trigger.value[1] != 42) {
WARN("Unexpected value");
}
// Iterate over what send is needed
if (trigger.fields.type & mscclppData){
@@ -85,7 +90,7 @@ void* mscclppProxyServiceP2P(void* _args) {
PROXYCUDACHECK(cudaStreamSynchronize(stream));
}
// Send completion
// Send completion: reset only the high 64 bits
volatile uint64_t *tmp = (volatile uint64_t *)(&conn->cpuTriggerFifo[conn->fifoTail]);
*tmp = 0;
conn->fifoTail++;
@@ -196,8 +201,13 @@ void* mscclppProxyServiceIb(void* _args) {
}
#else // (MSCCLPP_PROXY_FLAG_SET_BY_RDMA == 1)
// Poll to see if we are ready to send anything
trigger.value = *(volatile uint64_t *)(&conn->cpuTriggerFifo[conn->fifoTail]);
if (trigger.value == 0) continue;
trigger.value[0] = *(volatile uint64_t *)(conn->cpuTriggerFifo[conn->fifoTail].value);
if (trigger.value[0] == 0) continue;
// TODO(chhwang): latency overhead of reading value[1] is too large (~9us)
trigger.value[1] = *(volatile uint64_t *)(conn->cpuTriggerFifo[conn->fifoTail].value + 1);
if (trigger.value[1] != 42) {
WARN("Unexpected value");
}
if (trigger.fields.type & mscclppData) {
conn->ibQp->stageSend(conn->ibBuffMr, &conn->ibBuffMrInfo, (uint32_t)trigger.fields.dataSize,
@@ -240,7 +250,7 @@ void* mscclppProxyServiceIb(void* _args) {
}
}
// Send completion
// Send completion: reset only the high 64 bits
volatile uint64_t *tmp = (volatile uint64_t *)(&conn->cpuTriggerFifo[conn->fifoTail]);
*tmp = 0;
conn->fifoTail++;

View File

@@ -42,6 +42,17 @@ static double getTime(void)
__constant__ mscclppDevConn_t constDevConns[16];
__forceinline__ __device__ void setTrigger(mscclppTrigger *trig, uint64_t connId, uint64_t type,
uint64_t dataOffset, uint64_t dataSize)
{
asm volatile(
"st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(&trig->value),
"l"((type << (MSCCLPP_BITS_SIZE + MSCCLPP_BITS_OFFSET)) +
(dataOffset << (MSCCLPP_BITS_SIZE)) +
(dataSize)),
"l"(connId));
}
__global__ void kernel(int rank, int world_size)
{
if (threadIdx.x % 32 != 0) return;
@@ -54,7 +65,7 @@ __global__ void kernel(int rank, int world_size)
volatile uint64_t *remoteFlag = devConn.remoteFlag;
volatile uint64_t *proxyFlag = devConn.proxyFlag;
int curFifoHead = *devConn.triggerFifoHead;
volatile uint64_t *trig = (volatile uint64_t *)&devConn.trigger[curFifoHead];
mscclppTrigger *trig = &devConn.trigger[curFifoHead];
curFifoHead += 1;
if (curFifoHead == MSCCLPP_PROXY_FIFO_SIZE)
curFifoHead = 0;
@@ -78,12 +89,11 @@ __global__ void kernel(int rank, int world_size)
#if (USE_DMA_FOR_P2P == 1)
// Wait until the proxy have sent my data and flag
while (*trig != 0) {}
// Check only the high 64 bits
while (*(volatile uint64_t *)trig->value != 0) {}
// Trigger sending data and flag
uint64_t dataOffset = rank * sizeof(int);
uint64_t dataSize = sizeof(int);
*trig = TRIGGER_VALUE(mscclppFlag | mscclppData, dataOffset, dataSize);
setTrigger(trig, /*for test*/42, mscclppFlag | mscclppData, rank * sizeof(int), sizeof(int));
// Wait for receiving data from remote rank
while (*proxyFlag == baseFlag) {}