From c3cb81a90660701deb38e5c76abb2cc4c3f6fa47 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 26 Apr 2023 18:19:13 +0800 Subject: [PATCH] Fix putDirect (#62) `ring_send_recv_test_perf` result (1 node 8 gpus): ``` # minBytes 1024 maxBytes 1024 step: 1048576(bytes) warmup iters: 10 iters: 100 validation: 1 graph: 1, kernel num: 0 # # Using devices # Rank 0 Pid 365596 on costsim-dev-00000A device 0 [0001:00:00.0] NVIDIA A100-SXM4-80GB # Rank 1 Pid 365597 on costsim-dev-00000A device 1 [0002:00:00.0] NVIDIA A100-SXM4-80GB # Rank 2 Pid 365598 on costsim-dev-00000A device 2 [0003:00:00.0] NVIDIA A100-SXM4-80GB # Rank 3 Pid 365599 on costsim-dev-00000A device 3 [0004:00:00.0] NVIDIA A100-SXM4-80GB # Rank 4 Pid 365600 on costsim-dev-00000A device 4 [000B:00:00.0] NVIDIA A100-SXM4-80GB # Rank 5 Pid 365602 on costsim-dev-00000A device 5 [000C:00:00.0] NVIDIA A100-SXM4-80GB # Rank 6 Pid 365603 on costsim-dev-00000A device 6 [000D:00:00.0] NVIDIA A100-SXM4-80GB # Rank 7 Pid 365605 on costsim-dev-00000A device 7 [000E:00:00.0] NVIDIA A100-SXM4-80GB # # Initializing MSCCL++ # Setting up the connection in MSCCL++ # Launching MSCCL++ proxy threads # # in-place out-of-place # size count time algbw busbw #wrong time algbw busbw #wrong # (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) 1024 256 31.70 0.26 0.23 0 Stopping MSCCL++ proxy threads # Out of bounds values : 0 OK # ``` --- src/include/mscclpp.h | 4 ++-- tests/common.cu | 5 +++++ tests/ring_send_recv_test.cu | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index 7ff606d6..48544911 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -139,8 +139,8 @@ struct mscclppDevConn __forceinline__ __device__ void putDirect(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize, uint32_t threadId, uint32_t numThreads) { - uint64_t* src = (uint64_t*)localBuff + srcDataOffset; - uint64_t* dst = (uint64_t*)remoteBuff + dstDataOffset; + uint64_t* src = (uint64_t*)((char*)localBuff + srcDataOffset); + uint64_t* dst = (uint64_t*)((char*)remoteBuff + dstDataOffset); // assume the memory is aligned to 8 bytes size_t nElem = dataSize % sizeof(uint64_t) ? (dataSize + sizeof(uint64_t)) / sizeof(uint64_t) : dataSize / sizeof(uint64_t); diff --git a/tests/common.cu b/tests/common.cu index 1d424452..c9cecadf 100644 --- a/tests/common.cu +++ b/tests/common.cu @@ -224,6 +224,11 @@ testResult_t CheckData(struct testArgs* args, int in_place, int64_t* wrongElts) CUDACHECK(cudaMemcpy(dataHostRecv, args->recvbuff, args->expectedBytes, cudaMemcpyDeviceToHost)); CUDACHECK(cudaMemcpy(dataHostExpected, args->expected, args->expectedBytes, cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < count; i++) { + if (dataHostRecv[i] != dataHostExpected[i]) { + *wrongElts += 1; + } + } if (args->reportErrors && *wrongElts) { (args->error)++; } diff --git a/tests/ring_send_recv_test.cu b/tests/ring_send_recv_test.cu index 886aeae9..86a4339f 100644 --- a/tests/ring_send_recv_test.cu +++ b/tests/ring_send_recv_test.cu @@ -129,6 +129,6 @@ testResult_t RingSendRecvRunTest(struct testArgs* args) return testSuccess; } -struct testEngine ringSendRecvTestEngine = {RingSendRecvGetBuffSize, RingSendRecvRunTest}; +struct testEngine ringSendRecvTestEngine = {RingSendRecvGetBuffSize, RingSendRecvRunTest, nullptr, nullptr}; #pragma weak mscclppTestEngine = ringSendRecvTestEngine