mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-21 21:39:21 +00:00
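Fix putDirect (#62)
`putDirect` now applies `srcDataOffset` and `dstDataOffset` as byte offsets: each buffer is cast to `char*` before the offset is added, instead of adding the offset to a `uint64_t*` (which advanced the pointer by 8 bytes per offset unit).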
`ring_send_recv_test_perf` result (1 node, 8 GPUs):
```
# minBytes 1024 maxBytes 1024 step: 1048576(bytes) warmup iters: 10 iters: 100 validation: 1 graph: 1, kernel num: 0
#
# Using devices
# Rank 0 Pid 365596 on costsim-dev-00000A device 0 [0001:00:00.0] NVIDIA A100-SXM4-80GB
# Rank 1 Pid 365597 on costsim-dev-00000A device 1 [0002:00:00.0] NVIDIA A100-SXM4-80GB
# Rank 2 Pid 365598 on costsim-dev-00000A device 2 [0003:00:00.0] NVIDIA A100-SXM4-80GB
# Rank 3 Pid 365599 on costsim-dev-00000A device 3 [0004:00:00.0] NVIDIA A100-SXM4-80GB
# Rank 4 Pid 365600 on costsim-dev-00000A device 4 [000B:00:00.0] NVIDIA A100-SXM4-80GB
# Rank 5 Pid 365602 on costsim-dev-00000A device 5 [000C:00:00.0] NVIDIA A100-SXM4-80GB
# Rank 6 Pid 365603 on costsim-dev-00000A device 6 [000D:00:00.0] NVIDIA A100-SXM4-80GB
# Rank 7 Pid 365605 on costsim-dev-00000A device 7 [000E:00:00.0] NVIDIA A100-SXM4-80GB
#
# Initializing MSCCL++
# Setting up the connection in MSCCL++
# Launching MSCCL++ proxy threads
#
#                                     in-place                       out-of-place
#       size         count     time   algbw   busbw  #wrong     time   algbw   busbw  #wrong
#        (B)    (elements)     (us)  (GB/s)  (GB/s)             (us)  (GB/s)  (GB/s)
        1024           256    31.70    0.26    0.23       0
Stopping MSCCL++ proxy threads
# Out of bounds values : 0 OK
#
```
This commit is contained in:
@@ -139,8 +139,8 @@ struct mscclppDevConn
|
||||
__forceinline__ __device__ void putDirect(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize,
|
||||
uint32_t threadId, uint32_t numThreads)
|
||||
{
|
||||
uint64_t* src = (uint64_t*)localBuff + srcDataOffset;
|
||||
uint64_t* dst = (uint64_t*)remoteBuff + dstDataOffset;
|
||||
uint64_t* src = (uint64_t*)((char*)localBuff + srcDataOffset);
|
||||
uint64_t* dst = (uint64_t*)((char*)remoteBuff + dstDataOffset);
|
||||
// assume the memory is aligned to 8 bytes
|
||||
size_t nElem =
|
||||
dataSize % sizeof(uint64_t) ? (dataSize + sizeof(uint64_t)) / sizeof(uint64_t) : dataSize / sizeof(uint64_t);
|
||||
|
||||
@@ -224,6 +224,11 @@ testResult_t CheckData(struct testArgs* args, int in_place, int64_t* wrongElts)
|
||||
CUDACHECK(cudaMemcpy(dataHostRecv, args->recvbuff, args->expectedBytes, cudaMemcpyDeviceToHost));
|
||||
CUDACHECK(cudaMemcpy(dataHostExpected, args->expected, args->expectedBytes, cudaMemcpyDeviceToHost));
|
||||
|
||||
for (size_t i = 0; i < count; i++) {
|
||||
if (dataHostRecv[i] != dataHostExpected[i]) {
|
||||
*wrongElts += 1;
|
||||
}
|
||||
}
|
||||
if (args->reportErrors && *wrongElts) {
|
||||
(args->error)++;
|
||||
}
|
||||
|
||||
@@ -129,6 +129,6 @@ testResult_t RingSendRecvRunTest(struct testArgs* args)
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testEngine ringSendRecvTestEngine = {RingSendRecvGetBuffSize, RingSendRecvRunTest};
|
||||
struct testEngine ringSendRecvTestEngine = {RingSendRecvGetBuffSize, RingSendRecvRunTest, nullptr, nullptr};
|
||||
|
||||
#pragma weak mscclppTestEngine = ringSendRecvTestEngine
|
||||
|
||||
Reference in New Issue
Block a user