mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-14 02:03:03 +00:00
128-bit trigger
This commit is contained in:
@@ -22,18 +22,23 @@ typedef enum : uint64_t { mscclppData = 0x1,
|
||||
mscclppFlag = 0x2,
|
||||
mscclppSync = 0x4} mscclppTriggerType_t;
|
||||
|
||||
#define MSCCLPP_SIZE_BITS 30
|
||||
#define MSCCLPP_OFFSET_BITS 31
|
||||
#define MSCCLPP_BITS_SIZE 30
|
||||
#define MSCCLPP_BITS_OFFSET 31
|
||||
#define MSCCLPP_BITS_CONNID 10
|
||||
|
||||
#define TRIGGER_VALUE(__TYPE__,__OFFSET__,__SIZE__) (((((__TYPE__) << MSCCLPP_OFFSET_BITS) + (__OFFSET__)) << MSCCLPP_SIZE_BITS) + __SIZE__ )
|
||||
#define TRIGGER_VALUE(__TYPE__,__OFFSET__,__SIZE__) (((((__TYPE__) << MSCCLPP_BITS_OFFSET) + (__OFFSET__)) << MSCCLPP_BITS_SIZE) + __SIZE__ )
|
||||
|
||||
// the summation of number of bits must be 64 or less
|
||||
union alignas(8) mscclppTrigger {
|
||||
uint64_t value;
|
||||
// the summation of number of bits must be 128 or less
|
||||
union alignas(16) mscclppTrigger {
|
||||
uint64_t value[2];
|
||||
struct {
|
||||
uint64_t dataSize : MSCCLPP_SIZE_BITS;
|
||||
uint64_t dataOffset : MSCCLPP_OFFSET_BITS;
|
||||
// high 64 bits: value[0]
|
||||
uint64_t dataSize : MSCCLPP_BITS_SIZE;
|
||||
uint64_t dataOffset : MSCCLPP_BITS_OFFSET;
|
||||
uint64_t type : 3;
|
||||
uint64_t : 0; // ensure 64-bit alignment
|
||||
// low 64 bits: value[1]
|
||||
uint64_t connId : MSCCLPP_BITS_CONNID;
|
||||
} fields;
|
||||
};
|
||||
|
||||
|
||||
22
src/proxy.cc
22
src/proxy.cc
@@ -68,8 +68,13 @@ void* mscclppProxyServiceP2P(void* _args) {
|
||||
while (*run == MSCCLPP_PROXY_RUN_STATE_RUNNING) {
|
||||
for (struct mscclppConn *conn : conns) {
|
||||
// Poll to see if we are ready to send anything
|
||||
trigger.value = *(volatile uint64_t *)(&conn->cpuTriggerFifo[conn->fifoTail]);
|
||||
if (trigger.value == 0) continue;
|
||||
trigger.value[0] = *(volatile uint64_t *)(conn->cpuTriggerFifo[conn->fifoTail].value);
|
||||
if (trigger.value[0] == 0) continue;
|
||||
// TODO(chhwang): latency overhead of reading value[1] is too large (~9us)
|
||||
trigger.value[1] = *(volatile uint64_t *)(conn->cpuTriggerFifo[conn->fifoTail].value + 1);
|
||||
if (trigger.value[1] != 42) {
|
||||
WARN("Unexpected value");
|
||||
}
|
||||
|
||||
// Iterate over what send is needed
|
||||
if (trigger.fields.type & mscclppData){
|
||||
@@ -85,7 +90,7 @@ void* mscclppProxyServiceP2P(void* _args) {
|
||||
PROXYCUDACHECK(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
// Send completion
|
||||
// Send completion: reset only the high 64 bits
|
||||
volatile uint64_t *tmp = (volatile uint64_t *)(&conn->cpuTriggerFifo[conn->fifoTail]);
|
||||
*tmp = 0;
|
||||
conn->fifoTail++;
|
||||
@@ -196,8 +201,13 @@ void* mscclppProxyServiceIb(void* _args) {
|
||||
}
|
||||
#else // (MSCCLPP_PROXY_FLAG_SET_BY_RDMA == 1)
|
||||
// Poll to see if we are ready to send anything
|
||||
trigger.value = *(volatile uint64_t *)(&conn->cpuTriggerFifo[conn->fifoTail]);
|
||||
if (trigger.value == 0) continue;
|
||||
trigger.value[0] = *(volatile uint64_t *)(conn->cpuTriggerFifo[conn->fifoTail].value);
|
||||
if (trigger.value[0] == 0) continue;
|
||||
// TODO(chhwang): latency overhead of reading value[1] is too large (~9us)
|
||||
trigger.value[1] = *(volatile uint64_t *)(conn->cpuTriggerFifo[conn->fifoTail].value + 1);
|
||||
if (trigger.value[1] != 42) {
|
||||
WARN("Unexpected value");
|
||||
}
|
||||
|
||||
if (trigger.fields.type & mscclppData) {
|
||||
conn->ibQp->stageSend(conn->ibBuffMr, &conn->ibBuffMrInfo, (uint32_t)trigger.fields.dataSize,
|
||||
@@ -240,7 +250,7 @@ void* mscclppProxyServiceIb(void* _args) {
|
||||
}
|
||||
}
|
||||
|
||||
// Send completion
|
||||
// Send completion: reset only the high 64 bits
|
||||
volatile uint64_t *tmp = (volatile uint64_t *)(&conn->cpuTriggerFifo[conn->fifoTail]);
|
||||
*tmp = 0;
|
||||
conn->fifoTail++;
|
||||
|
||||
@@ -42,6 +42,17 @@ static double getTime(void)
|
||||
|
||||
__constant__ mscclppDevConn_t constDevConns[16];
|
||||
|
||||
__forceinline__ __device__ void setTrigger(mscclppTrigger *trig, uint64_t connId, uint64_t type,
|
||||
uint64_t dataOffset, uint64_t dataSize)
|
||||
{
|
||||
asm volatile(
|
||||
"st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(&trig->value),
|
||||
"l"((type << (MSCCLPP_BITS_SIZE + MSCCLPP_BITS_OFFSET)) +
|
||||
(dataOffset << (MSCCLPP_BITS_SIZE)) +
|
||||
(dataSize)),
|
||||
"l"(connId));
|
||||
}
|
||||
|
||||
__global__ void kernel(int rank, int world_size)
|
||||
{
|
||||
if (threadIdx.x % 32 != 0) return;
|
||||
@@ -54,7 +65,7 @@ __global__ void kernel(int rank, int world_size)
|
||||
volatile uint64_t *remoteFlag = devConn.remoteFlag;
|
||||
volatile uint64_t *proxyFlag = devConn.proxyFlag;
|
||||
int curFifoHead = *devConn.triggerFifoHead;
|
||||
volatile uint64_t *trig = (volatile uint64_t *)&devConn.trigger[curFifoHead];
|
||||
mscclppTrigger *trig = &devConn.trigger[curFifoHead];
|
||||
curFifoHead += 1;
|
||||
if (curFifoHead == MSCCLPP_PROXY_FIFO_SIZE)
|
||||
curFifoHead = 0;
|
||||
@@ -78,12 +89,11 @@ __global__ void kernel(int rank, int world_size)
|
||||
#if (USE_DMA_FOR_P2P == 1)
|
||||
|
||||
// Wait until the proxy have sent my data and flag
|
||||
while (*trig != 0) {}
|
||||
// Check only the high 64 bits
|
||||
while (*(volatile uint64_t *)trig->value != 0) {}
|
||||
|
||||
// Trigger sending data and flag
|
||||
uint64_t dataOffset = rank * sizeof(int);
|
||||
uint64_t dataSize = sizeof(int);
|
||||
*trig = TRIGGER_VALUE(mscclppFlag | mscclppData, dataOffset, dataSize);
|
||||
setTrigger(trig, /*for test*/42, mscclppFlag | mscclppData, rank * sizeof(int), sizeof(int));
|
||||
|
||||
// Wait for receiving data from remote rank
|
||||
while (*proxyFlag == baseFlag) {}
|
||||
|
||||
Reference in New Issue
Block a user