mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-04-19 22:39:11 +00:00
Align interfaces of put/get/putPackets/getPackets (#185)
This commit is contained in:
37
.github/workflows/integration-test-backup.yml
vendored
37
.github/workflows/integration-test-backup.yml
vendored
@@ -5,17 +5,20 @@ on: workflow_dispatch
|
||||
jobs:
|
||||
IntegrationTest:
|
||||
runs-on: self-hosted
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
container-image: [ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8, ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1]
|
||||
cuda: [ cuda11.8, cuda12.1 ]
|
||||
|
||||
container:
|
||||
image: ${{ matrix.container-image }}
|
||||
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
|
||||
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install CMake
|
||||
run: |
|
||||
@@ -39,35 +42,35 @@ jobs:
|
||||
run: |
|
||||
set -e
|
||||
export PATH=/usr/local/mpi/bin:$PATH
|
||||
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
||||
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
|
||||
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
|
||||
|
||||
- name: Run mscclpp SendRecv test
|
||||
run: |
|
||||
set -e
|
||||
export PATH=/usr/local/mpi/bin:$PATH
|
||||
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
|
||||
- name: Run mscclpp AllReduce test
|
||||
run: |
|
||||
set -e
|
||||
export PATH=/usr/local/mpi/bin:$PATH
|
||||
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
||||
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
|
||||
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
|
||||
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
|
||||
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
|
||||
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
|
||||
|
||||
- name: Run mscclpp AllToAll test
|
||||
run: |
|
||||
set -e
|
||||
export PATH=/usr/local/mpi/bin:$PATH
|
||||
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
||||
|
||||
- name: Check collective primitives performance
|
||||
run: |
|
||||
|
||||
23
.github/workflows/ut-backup.yml
vendored
23
.github/workflows/ut-backup.yml
vendored
@@ -5,18 +5,21 @@ on: workflow_dispatch
|
||||
jobs:
|
||||
UnitTest:
|
||||
runs-on: self-hosted
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
timeout-minutes: 30
|
||||
strategy:
|
||||
matrix:
|
||||
container-image: [ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8, ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1]
|
||||
cuda: [ cuda11.8, cuda12.1 ]
|
||||
|
||||
container:
|
||||
image: ${{ matrix.container-image }}
|
||||
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
|
||||
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
@@ -44,9 +47,9 @@ jobs:
|
||||
run: |
|
||||
set -e
|
||||
export PATH=/usr/local/mpi/bin:$PATH
|
||||
mpirun -tag-output -np 2 ./build/test/mp_unit_tests
|
||||
mpirun -tag-output -np 4 ./build/test/mp_unit_tests
|
||||
mpirun -tag-output -np 8 ./build/test/mp_unit_tests
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests
|
||||
mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests
|
||||
mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests
|
||||
working-directory: ${{ github.workspace }}
|
||||
|
||||
- name: PyTests
|
||||
@@ -54,10 +57,10 @@ jobs:
|
||||
set -e
|
||||
export PATH=/usr/local/mpi/bin:$PATH
|
||||
cd build && make pylib-copy
|
||||
if [[ '${{ matrix.container-image }}' == *'cuda11'* ]]; then
|
||||
pip3 install -r ../python/test/requirements_cu11.txt
|
||||
if [[ '${{ matrix.cuda }}' == 'cuda11'* ]]; then
|
||||
python3 -m pip install -r ../python/test/requirements_cu11.txt
|
||||
else
|
||||
pip3 install -r ../python/test/requirements_cu12.txt
|
||||
python3 -m pip install -r ../python/test/requirements_cu12.txt
|
||||
fi
|
||||
mpirun -tag-output -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x
|
||||
mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x
|
||||
working-directory: ${{ github.workspace }}
|
||||
|
||||
@@ -48,7 +48,7 @@ union LLPacket {
|
||||
/// @param flag The flag to read.
|
||||
/// @param data The 8-byte data read.
|
||||
/// @return True if the flag is not equal to the given flag.
|
||||
__forceinline__ __device__ bool readOnce(uint32_t flag, uint2& data) {
|
||||
__forceinline__ __device__ bool readOnce(uint32_t flag, uint2& data) const {
|
||||
uint32_t flag1, flag2;
|
||||
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];"
|
||||
: "=r"(data.x), "=r"(flag1), "=r"(data.y), "=r"(flag2)
|
||||
@@ -60,7 +60,7 @@ union LLPacket {
|
||||
/// @param flag The flag to read.
|
||||
/// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative.
|
||||
/// @return The 8-byte data read.
|
||||
__forceinline__ __device__ uint2 read(uint32_t flag, int64_t maxSpinCount = 100000000) {
|
||||
__forceinline__ __device__ uint2 read(uint32_t flag, int64_t maxSpinCount = 100000000) const {
|
||||
uint2 data;
|
||||
POLL_MAYBE_JAILBREAK(readOnce(flag, data), maxSpinCount);
|
||||
return data;
|
||||
@@ -75,28 +75,31 @@ union LLPacket {
|
||||
};
|
||||
|
||||
#ifdef __CUDACC__
|
||||
__forceinline__ __device__ void putPackets(void* dst, uint64_t dstOffset, void* src, uint64_t srcOffset,
|
||||
uint64_t srcBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) {
|
||||
/// Read from the origin and write to the target buffer.
|
||||
__forceinline__ __device__ void putPackets(void* targetPtr, uint64_t targetOffset, const void* originPtr,
|
||||
uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
|
||||
uint32_t numThreads, uint32_t flag) {
|
||||
// Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes
|
||||
uint32_t* srcBase = (uint32_t*)((char*)src + srcOffset);
|
||||
LLPacket* dstBase = (LLPacket*)((char*)dst + dstOffset);
|
||||
size_t nElem = srcBytes / sizeof(uint64_t);
|
||||
const uint32_t* originBase = (const uint32_t*)((const char*)originPtr + originOffset);
|
||||
LLPacket* targetBase = (LLPacket*)((char*)targetPtr + targetOffset);
|
||||
size_t nElem = originBytes / sizeof(uint64_t);
|
||||
for (size_t i = threadId; i < nElem; i += numThreads) {
|
||||
LLPacket* pkt = &dstBase[i];
|
||||
pkt->write(srcBase[2 * i], srcBase[2 * i + 1], flag);
|
||||
LLPacket* pkt = &targetBase[i];
|
||||
pkt->write(originBase[2 * i], originBase[2 * i + 1], flag);
|
||||
}
|
||||
}
|
||||
|
||||
__forceinline__ __device__ void getPackets(void* dst, uint64_t dstOffset, void* src, uint64_t srcOffset,
|
||||
uint64_t dstBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) {
|
||||
/// Read from the target buffer and write to the origin.
|
||||
__forceinline__ __device__ void getPackets(const void* targetPtr, uint64_t targetOffset, void* originPtr,
|
||||
uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
|
||||
uint32_t numThreads, uint32_t flag) {
|
||||
// Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes
|
||||
// TODO(saemal): this is not matching sm_channel get method.
|
||||
LLPacket* srcBase = (LLPacket*)((char*)src + srcOffset);
|
||||
uint2* dstBase = (uint2*)((char*)dst + dstOffset);
|
||||
size_t nElem = dstBytes / sizeof(uint2);
|
||||
const LLPacket* targetBase = (const LLPacket*)((const char*)targetPtr + targetOffset);
|
||||
uint2* originBase = (uint2*)((char*)originPtr + originOffset);
|
||||
size_t nElem = originBytes / sizeof(uint2);
|
||||
for (size_t i = threadId; i < nElem; i += numThreads) {
|
||||
LLPacket* pkt = &srcBase[i];
|
||||
dstBase[i] = pkt->read(flag);
|
||||
const LLPacket* pkt = &targetBase[i];
|
||||
originBase[i] = pkt->read(flag);
|
||||
}
|
||||
}
|
||||
#endif // __CUDACC__
|
||||
|
||||
@@ -196,48 +196,50 @@ struct SmChannelDeviceHandle {
|
||||
}
|
||||
}
|
||||
|
||||
/// Copy data from the local memory to the remote memory.
|
||||
/// Copy data from the local memory (origin) to the remote memory (target).
|
||||
///
|
||||
/// This function is intended to be collectively called by multiple threads. Each thread copies a part of data.
|
||||
///
|
||||
/// @tparam Alignment The alignment of the source and destination addresses. Should be 4, 8, or a multiple of 16.
|
||||
/// @tparam CopyRemainder Whether to copy remainder bytes when the number of bytes is not a multiple of @p
|
||||
/// Alignment.
|
||||
/// @param dstOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment.
|
||||
/// @param srcOffset The offset in bytes of the local address. Should be a multiple of @p Alignment.
|
||||
/// @param bytes Bytes of the data to be copied. Should be a multiple of @p Alignment.
|
||||
/// @param targetOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment.
|
||||
/// @param originOffset The offset in bytes of the local address. Should be a multiple of @p Alignment.
|
||||
/// @param originBytes Bytes of the origin to be copied. Should be a multiple of @p Alignment.
|
||||
/// @param threadId The index of the current thread among all threads running this function. This is different from
|
||||
/// the `threadIdx` in CUDA.
|
||||
/// @param numThreads The total number of threads that run this function.
|
||||
///
|
||||
template <int Alignment = 16, bool CopyRemainder = true>
|
||||
__forceinline__ __device__ void put(uint64_t dstOffset, uint64_t srcOffset, uint64_t bytes, uint32_t threadId,
|
||||
uint32_t numThreads) {
|
||||
copy<Alignment, CopyRemainder>((char*)dst_ + dstOffset, (char*)src_ + srcOffset, bytes, threadId, numThreads);
|
||||
__forceinline__ __device__ void put(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
|
||||
uint32_t threadId, uint32_t numThreads) {
|
||||
copy<Alignment, CopyRemainder>((char*)dst_ + targetOffset, (char*)src_ + originOffset, originBytes, threadId,
|
||||
numThreads);
|
||||
}
|
||||
|
||||
/// Copy data from the remote memory to the local memory.
|
||||
/// Copy data from the remote memory (target) to the local memory (origin).
|
||||
///
|
||||
/// This function is intended to be collectively called by multiple threads. Each thread copies a part of data.
|
||||
///
|
||||
/// @tparam Alignment The alignment of the source and destination addresses. Should be 4, 8, or a multiple of 16.
|
||||
/// @tparam CopyRemainder Whether to copy remainder bytes when the number of bytes is not a multiple of @p
|
||||
/// Alignment.
|
||||
/// @param dstOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment.
|
||||
/// @param srcOffset The offset in bytes of the local address. Should be a multiple of @p Alignment.
|
||||
/// @param bytes Bytes of the data to be copied. Should be a multiple of @p Alignment.
|
||||
/// @param targetOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment.
|
||||
/// @param originOffset The offset in bytes of the local address. Should be a multiple of @p Alignment.
|
||||
/// @param originBytes Bytes of the origin to be copied. Should be a multiple of @p Alignment.
|
||||
/// @param threadId The index of the current thread among all threads running this function. This is different from
|
||||
/// the `threadIdx` in CUDA.
|
||||
/// @param numThreads The total number of threads that run this function.
|
||||
///
|
||||
template <int Alignment = 16, bool CopyRemainder = true>
|
||||
__forceinline__ __device__ void get(uint64_t dstOffset, uint64_t srcOffset, uint64_t bytes, uint32_t threadId,
|
||||
uint32_t numThreads) {
|
||||
__forceinline__ __device__ void get(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
|
||||
uint32_t threadId, uint32_t numThreads) {
|
||||
// Note that `dst` and `src` are swapped for `get()`.
|
||||
copy<Alignment, CopyRemainder>((char*)src_ + srcOffset, (char*)dst_ + dstOffset, bytes, threadId, numThreads);
|
||||
copy<Alignment, CopyRemainder>((char*)src_ + originOffset, (char*)dst_ + targetOffset, originBytes, threadId,
|
||||
numThreads);
|
||||
}
|
||||
|
||||
/// Copy data from the local memory to the remote memory.
|
||||
/// Copy data from the local memory (origin) to the remote memory (target).
|
||||
///
|
||||
/// This function is intended to be collectively called by multiple threads. Each thread copies a part of data.
|
||||
///
|
||||
@@ -251,11 +253,11 @@ struct SmChannelDeviceHandle {
|
||||
/// @param numThreads The total number of threads that run this function.
|
||||
///
|
||||
template <int Alignment = 16, bool CopyRemainder = true>
|
||||
__forceinline__ __device__ void put(uint64_t offset, uint64_t size, uint32_t threadId, uint32_t numThreads) {
|
||||
put<Alignment, CopyRemainder>(offset, offset, size, threadId, numThreads);
|
||||
__forceinline__ __device__ void put(uint64_t offset, uint64_t bytes, uint32_t threadId, uint32_t numThreads) {
|
||||
put<Alignment, CopyRemainder>(offset, offset, bytes, threadId, numThreads);
|
||||
}
|
||||
|
||||
/// Copy data from the remote memory to the local memory.
|
||||
/// Copy data from the remote memory (target) to the local memory (origin).
|
||||
///
|
||||
/// This function is intended to be collectively called by multiple threads. Each thread copies a part of data.
|
||||
///
|
||||
@@ -269,40 +271,41 @@ struct SmChannelDeviceHandle {
|
||||
/// @param numThreads The total number of threads that run this function.
|
||||
///
|
||||
template <int Alignment = 16, bool CopyRemainder = true>
|
||||
__forceinline__ __device__ void get(uint64_t offset, uint64_t size, uint32_t threadId, uint32_t numThreads) {
|
||||
get<Alignment, CopyRemainder>(offset, offset, size, threadId, numThreads);
|
||||
__forceinline__ __device__ void get(uint64_t offset, uint64_t bytes, uint32_t threadId, uint32_t numThreads) {
|
||||
get<Alignment, CopyRemainder>(offset, offset, bytes, threadId, numThreads);
|
||||
}
|
||||
|
||||
/// Construct @ref LLPacket from the data in the local memory and write it on the remote memory.
|
||||
/// Construct @ref LLPacket from the data in the local memory (origin) and write it on the remote packet buffer
|
||||
/// (target).
|
||||
///
|
||||
/// This function is intended to be collectively called by multiple threads. Each thread copies a part of packets.
|
||||
///
|
||||
/// @param dstOffset The offset in bytes of the remote address.
|
||||
/// @param srcOffset The offset in bytes of the local address.
|
||||
/// @param bytes Bytes of the data to be copied.
|
||||
/// @param targetOffset The offset in bytes of the remote packet buffer.
|
||||
/// @param originOffset The offset in bytes of the local data.
|
||||
/// @param originBytes Bytes of the origin to be copied.
|
||||
/// @param threadId The index of the current thread among all threads running this function. This is different from
|
||||
/// the `threadIdx` in CUDA.
|
||||
/// @param numThreads The total number of threads that run this function.
|
||||
///
|
||||
__forceinline__ __device__ void putPackets(uint64_t dstOffset, uint64_t srcOffset, uint64_t bytes, uint32_t threadId,
|
||||
uint32_t numThreads, uint32_t flag) {
|
||||
mscclpp::putPackets(dst_, dstOffset, src_, srcOffset, bytes, threadId, numThreads, flag);
|
||||
__forceinline__ __device__ void putPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
|
||||
uint32_t threadId, uint32_t numThreads, uint32_t flag) {
|
||||
mscclpp::putPackets(dst_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag);
|
||||
}
|
||||
|
||||
/// Retrieve data from @ref LLPacket in the local packet buffer and write it on the local memory.
|
||||
/// Retrieve data from @ref LLPacket in the local packet buffer (target) and write it on the local data (origin).
|
||||
///
|
||||
/// This function is intended to be collectively called by multiple threads. Each thread copies a part of data.
|
||||
///
|
||||
/// @param dstOffset The offset in bytes of the local memory.
|
||||
/// @param srcOffset The offset in bytes of the local packet buffer.
|
||||
/// @param bytes Bytes of the data to be copied.
|
||||
/// @param targetOffset The offset in bytes of the local packet buffer.
|
||||
/// @param originOffset The offset in bytes of the local data.
|
||||
/// @param originBytes Bytes of the origin to be copied.
|
||||
/// @param threadId The index of the current thread among all threads running this function. This is different from
|
||||
/// the `threadIdx` in CUDA.
|
||||
/// @param numThreads The total number of threads that run this function.
|
||||
///
|
||||
__forceinline__ __device__ void getPackets(uint64_t dstOffset, uint64_t srcOffset, uint64_t bytes, uint32_t threadId,
|
||||
uint32_t numThreads, uint32_t flag) {
|
||||
mscclpp::getPackets(src_, dstOffset, getPacketBuffer_, srcOffset, bytes, threadId, numThreads, flag);
|
||||
__forceinline__ __device__ void getPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
|
||||
uint32_t threadId, uint32_t numThreads, uint32_t flag) {
|
||||
mscclpp::getPackets(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag);
|
||||
}
|
||||
|
||||
/// Signal the remote semaphore.
|
||||
|
||||
@@ -26,7 +26,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1)
|
||||
channels[tid].put(2 * my_offset, 2 * my_offset, 2 * size_per_rank);
|
||||
}
|
||||
if (my_nghr != my_rank && my_nghr < nranks)
|
||||
mscclpp::getPackets(data, my_nghr_offset, scratch, 2 * my_nghr_offset, size_per_rank, tid % nthreads_per_rank,
|
||||
mscclpp::getPackets(scratch, 2 * my_nghr_offset, data, my_nghr_offset, size_per_rank, tid % nthreads_per_rank,
|
||||
nthreads_per_rank, flag);
|
||||
} else {
|
||||
if (tid < nranks && tid != my_rank) {
|
||||
|
||||
@@ -17,7 +17,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1)
|
||||
if (bid < nranks && bid != my_rank) {
|
||||
if (use_packet) {
|
||||
channels[bid].putPackets(2 * my_offset, my_offset, size_per_rank, tid, blockDim.x, flag);
|
||||
channels[bid].getPackets(my_nghr_offset, 2 * my_nghr_offset, size_per_rank, tid, blockDim.x, flag);
|
||||
channels[bid].getPackets(2 * my_nghr_offset, my_nghr_offset, size_per_rank, tid, blockDim.x, flag);
|
||||
} else {
|
||||
channels[bid].put(my_offset, my_offset, size_per_rank, tid, blockDim.x);
|
||||
__syncthreads();
|
||||
|
||||
@@ -238,7 +238,7 @@ __global__ void kernelProxyLLPingPong(int* buff, mscclpp::LLPacket* putPktBuf, m
|
||||
flusher = 0;
|
||||
}
|
||||
} else {
|
||||
mscclpp::getPackets(buff, 0, getPktBuf, 0, nElem * sizeof(int), threadId, numThreads, flag);
|
||||
mscclpp::getPackets(getPktBuf, 0, buff, 0, nElem * sizeof(int), threadId, numThreads, flag);
|
||||
if (CheckCorrectness) {
|
||||
// If each thread reads 8 bytes at once, we don't need a barrier after getPackets().
|
||||
// __syncthreads();
|
||||
|
||||
Reference in New Issue
Block a user