mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-03-26 10:07:52 +00:00
* Moved the `MemoryChannel::copy()` method out of the `MemoryChannel` as a standalone function. * Renamed `mscclpp::putPackets()` and `mscclpp::getPackets()` to `mscclpp::copyToPackets()` and `mscclpp::copyFromPackets()` respectively for consistency. * Renamed `MemoryChannel::getPackets()` to `MemoryChannel::unpackPackets()` for clarity. Renamed `getPacketBuffer` to `packetBuffer`. * Added the `MemoryChannel::unpackPacket()` method that unpacks one packet in the buffer. * Added the `BaseMemoryChannel` class that only contains a semaphore without memory addresses. * Removed the `MemoryDevice2DeviceSemaphoreDeviceHandle::signalPacket()` method that is lacking use cases.
30 lines
1.1 KiB
Plaintext
30 lines
1.1 KiB
Plaintext
// Copyright (c) Microsoft Corporation.
|
|
// Licensed under the MIT license.
|
|
|
|
#include <mscclpp/memory_channel_device.hpp>
|
|
|
|
// Be careful about using channels[my_rank]: that entry is invalid and exists only to keep indexing by rank simple.
|
|
// Test kernel: the block with index `b` exchanges one slice of an int buffer with the peer of
// rank `b` through channels[b].
//
// Launch layout: 1-D grid and 1-D blocks; only blockIdx.x and threadIdx.x are read. Blocks with
// blockIdx.x >= nranks and the block with blockIdx.x == my_rank do nothing. __launch_bounds__
// caps the block size at 1024 threads.
//
// Parameters:
//   channels     - channel handles indexed by peer rank; channels[my_rank] is never accessed.
//   my_rank      - rank of the caller; selects the slice offset that is sent.
//   nranks       - number of ranks; used as a divisor for the slice size.
//   num_elements - total number of int elements; the byte size is split across ranks with
//                  integer division, so any remainder is not transferred.
//   use_packet   - non-zero: putPackets() followed by unpackPackets();
//                  zero: put() followed by signal()/wait() issued by thread 0.
extern "C" __global__ void __launch_bounds__(1024, 1)
    memory_channel(mscclpp::MemoryChannelDeviceHandle* channels, int my_rank, int nranks, int num_elements,
                   int use_packet) {
  const int tid = threadIdx.x;
  const int bid = blockIdx.x;

  // The guard depends only on blockIdx.x and kernel arguments, so a whole block either returns
  // here or continues; the __syncthreads() below is therefore reached by every remaining thread.
  // Returning before the division also avoids evaluating `/ nranks` when nranks <= 0.
  if (bid >= nranks || bid == my_rank) return;

  const uint64_t size_per_rank = (num_elements * sizeof(int)) / nranks;  // slice size in bytes
  const uint64_t my_offset = size_per_rank * my_rank;                    // byte offset of this rank's slice
  const uint64_t my_nghr_offset = size_per_rank * bid;                   // byte offset of the peer's slice

  if (use_packet) {
    const int flag = 123;
    // NOTE(review): the packet-side offsets are doubled, presumably because the packet format
    // occupies twice the space of the payload -- confirm against the mscclpp packet layout.
    channels[bid].putPackets(2 * my_offset, my_offset, size_per_rank, tid, blockDim.x, flag);
    channels[bid].unpackPackets(2 * my_nghr_offset, my_nghr_offset, size_per_rank, tid, blockDim.x, flag);
  } else {
    channels[bid].put(my_offset, my_offset, size_per_rank, tid, blockDim.x);
    // Every thread of the block must have finished its share of put() before thread 0 signals.
    __syncthreads();
    if (tid == 0) {
      channels[bid].signal();
      channels[bid].wait();
    }
  }
}
|