From e2ee8d80b93f0a7d9ee6ae2d8f79efbb268ab19c Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 21 Mar 2023 06:26:12 +0000 Subject: [PATCH] perf fix for multi-node allgather --- tests/allgather_test.cu | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/allgather_test.cu b/tests/allgather_test.cu index 06ec90c3..fc9e921c 100644 --- a/tests/allgather_test.cu +++ b/tests/allgather_test.cu @@ -59,18 +59,23 @@ __global__ void kernel(int rank, int world_size, int nelemsPerGPU) } // Each warp receives data from different ranks -#if 0 +#if 1 // get a thread-local trigger and a request for waiting on it mscclppTrigger_t trig; mscclppRequest_t req = devConn.fifo.getTrigger(&trig); // Trigger sending data, flag and synchronize after - devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int)); + devConn.fifo.setTrigger(trig, mscclppData, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int)); + // we cannot reuse buffer and flag until the request is completed + + req = devConn.fifo.getTrigger(&trig); + + // Trigger sending the flag and synchronize after + devConn.fifo.setTrigger(trig, mscclppFlag | mscclppSync, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int)); // we cannot reuse buffer and flag until the request is completed // Wait on the request to make sure it is safe to reuse buffer and flag devConn.fifo.waitTrigger(req); - // Wait for receiving data from remote rank while (*proxyFlag == baseFlag); #else