// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <mpi.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

#include <mscclpp/gpu.hpp>
#if (CUDA_NVLS_API_AVAILABLE)
#include <cuda.h>
#include <cudaTypedefs.h>
#include <cuda_runtime.h>

#define CUCHECK(cmd)                                       \
  do {                                                     \
    auto err = cmd;                                        \
    if (err != 0) {                                        \
      printf("Cuda failure %d: Line %d\n", err, __LINE__); \
      exit(-1);                                            \
    }                                                      \
  } while (false)

// AR kernel snippet for sm_90 only

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
#define MULTIMEM_ST(val, ptr)                                                                                    \
  asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \
               "r"(val.w)                                                                                        \
               : "memory");
// this PTX is fp32-specific; an fp16 reduction would use multimem.ld_reduce.global.add.v4.f16x2,
// bf16 would be multimem.ld_reduce.global.add.v4.bf16x2, etc.
#define MULTIMEM_LD(val, ptr)                                     \
  asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \
      : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)        \
      : "l"(ptr)                                                  \
      : "memory");
#else
#define MULTIMEM_ST(val, ptr)
#define MULTIMEM_LD(val, ptr)
#endif
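
// Background on the two instructions above: multimem.ld_reduce performs the reduction in
// the NVLink fabric -- a single load through a multicast address returns the sum of that
// location across every device bound to the multicast object -- and multimem.st broadcasts
// one store to all of them. That is what lets the kernel below do an in-place allreduce
// with just one load and one store per 16 B line.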

__global__ void init_kernel(float* uc_ptr, size_t size, int myrank, int nranks) {
  for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x) {
    uc_ptr[idx] = myrank + idx;
  }
}

__global__ void check_correctness(float* uc_ptr, size_t size, int myrank, int nranks) {
  for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x) {
    float expected = (float)((nranks * (nranks - 1)) / 2 + nranks * idx);
    if (abs(uc_ptr[idx] - expected) > 0.01 * expected) {
      printf("error! idx %lu: %f != %f\n", idx, uc_ptr[idx], expected);
    }
  }
}
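
// Allreduce scheme used below: ranks statically split the buffer into nranks contiguous
// shards; each rank issues multimem.ld_reduce on its own shard (the fabric returns the sum
// across all ranks) and immediately multimem.st's the result, which broadcasts it back to
// every rank's copy. Once all ranks finish, the whole buffer holds the reduced values.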

__global__ void testing(float* mc_ptr, size_t size, int myrank, int nranks) {
  // for allreduce we don't even need a UC pointer; just use the same mc_ptr for in-place reduction
  // a line is assumed to be 16B: 4 ints or 8 halves
  size_t my_st = ((int64_t)size * (int64_t)myrank) / (int64_t)nranks;
  size_t my_en = ((int64_t)size * (int64_t)(myrank + 1)) / (int64_t)nranks;

  size_t my_offset = (threadIdx.x + blockIdx.x * blockDim.x) * 4;
  size_t my_step = blockDim.x * gridDim.x * 4;

  for (size_t idx = my_st + my_offset; idx < my_en; idx += my_step) {
    [[maybe_unused]] uint4 val;
    MULTIMEM_LD(val, mc_ptr + idx);
    MULTIMEM_ST(val, mc_ptr + idx);
  }
}

int main() {
  int myrank, nranks;
  MPI_Init(NULL, NULL);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  cudaSetDevice(myrank);

  size_t size = 1024ULL * 1024ULL * 512ULL * 16;
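  // (1024 * 1024 * 512 * 16 bytes = 8 GiB backing the multicast object)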
  CUmemAllocationHandleType handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;

  CUmulticastObjectProp mcProp = {};
  mcProp.numDevices = nranks;
  mcProp.size = size;
  mcProp.handleTypes = handleType;

  size_t minGran, gran;
  gran = 0;
  minGran = 0;
  CUCHECK(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM));
  CUCHECK(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED));

  if (!myrank) printf("nvls multicast granularity: gran = %lu, minGran = %lu\n", gran, minGran);
  size_t mcSize = ((size + gran - 1) / gran) * gran;
  mcProp.size = mcSize;

  CUmemGenericAllocationHandle handle;
  // only one rank creates the multicast object
  if (!myrank) CUCHECK(cuMulticastCreate(&handle, &mcProp));

  int fd = 0;
  if (!myrank) CUCHECK(cuMemExportToShareableHandle(&fd, handle, handleType, 0 /*flags*/));

  // some ugly UDS business
  // Borrow ipcsocket.{c,h} from the nccl code
  // in CUDA 12.4 a new fabric handle type is available, so instead it would be possible to use MPI_Allgather
  // for the exported handles
  // moreover, that would be the only way to do it on Grace Hopper systems, since UDS is limited to a single
  // Unix node

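  // A minimal sketch of that fabric-handle alternative (an untested assumption here, requiring
  // CUDA >= 12.4 with fabric/IMEX support enabled):
  //
  //   mcProp.handleTypes = CU_MEM_HANDLE_TYPE_FABRIC;  // must be set before cuMulticastCreate
  //   CUmemFabricHandle fh;                            // plain bytes, safe to send over MPI
  //   if (!myrank) CUCHECK(cuMemExportToShareableHandle(&fh, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0));
  //   MPI_Bcast(&fh, sizeof(fh), MPI_CHAR, 0, MPI_COMM_WORLD);
  //   if (myrank) CUCHECK(cuMemImportFromShareableHandle(&handle, &fh, CU_MEM_HANDLE_TYPE_FABRIC));
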
  pid_t currentPid = getpid();
  MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD);
  MPI_Bcast(&currentPid, sizeof(currentPid), MPI_CHAR, 0, MPI_COMM_WORLD);
  int pidFd = syscall(SYS_pidfd_open, currentPid, 0);

  // everyone else will now get the same multicast object
  int peerFd = 0;
  peerFd = syscall(SYS_pidfd_getfd, pidFd, fd, 0);
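  // peerFd is now a local duplicate of rank 0's exported descriptor; SYS_pidfd_getfd needs
  // Linux >= 5.6 and ptrace permission over the target process.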
  if (myrank) CUCHECK(cuMemImportFromShareableHandle(&handle, reinterpret_cast<void*>(peerFd), handleType));
  MPI_Barrier(MPI_COMM_WORLD);

  if (!myrank) close(fd);  // on other ranks, fd is just rank 0's descriptor number, not a valid local fd
  // end of ugly UDS business
  // everyone adds their device(s); no syncs required, just need to ensure the memory bind below happens
  // after all devices are added
  int mydev = myrank;
  CUCHECK(cuMulticastAddDevice(handle, mydev));
  MPI_Barrier(MPI_COMM_WORLD);

  CUmemGenericAllocationHandle memhandle;
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = mydev;
  prop.requestedHandleTypes = handleType;

  // allocate physical memory (data buffer)
  CUCHECK(cuMemCreate(&memhandle, mcSize, &prop, 0 /*flags*/));

  void* uc_va;
  void* mc_va;
  CUmemAccessDesc accessDesc = {};
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDesc.location.id = mydev;
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

  // Map a VA to UC space
  CUCHECK(cuMemAddressReserve((CUdeviceptr*)&uc_va, mcSize, minGran, 0U, 0));
  CUCHECK(cuMemMap((CUdeviceptr)uc_va, mcSize, 0, memhandle, 0));
  // set access on UC address
  CUCHECK(cuMemSetAccess((CUdeviceptr)uc_va, mcSize, &accessDesc, 1));
  // Memset UC space
  cudaMemset(uc_va, 0, mcSize);

  // everyone binds memory to the multicast
  CUCHECK(cuMulticastBindAddr(handle, 0 /*mcOffset*/, (CUdeviceptr)uc_va, mcSize, 0));
  MPI_Barrier(MPI_COMM_WORLD);
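  // Each rank has now bound its own physical allocation into the multicast object; once every
  // rank has done so (hence the barrier), accesses through the multicast mapping fan out to
  // all ranks' memory.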
  // usual VA business: map both MC and PA to two different VA addresses

  // Map a VA to MC space
  CUCHECK(cuMemAddressReserve((CUdeviceptr*)&mc_va, mcSize, minGran, 0U, 0));
  CUCHECK(cuMemMap((CUdeviceptr)mc_va, mcSize, 0, handle, 0));
  // set access on MC address
  CUCHECK(cuMemSetAccess((CUdeviceptr)mc_va, mcSize, &accessDesc, 1));
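  // uc_va and mc_va alias the same local physical pages: uc_va is a plain per-GPU view, while
  // loads/stores through mc_va go through the multicast object, so MULTIMEM_LD / MULTIMEM_ST
  // must target mc_va.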

  int rept = 10;
  int block_size = 1024;
  int nblocks = 16;

  cudaDeviceSynchronize();
  MPI_Barrier(MPI_COMM_WORLD);
  init_kernel<<<nblocks, block_size>>>((float*)uc_va, size / sizeof(float), myrank, nranks);
  cudaDeviceSynchronize();
  MPI_Barrier(MPI_COMM_WORLD);
  testing<<<nblocks, block_size>>>((float*)mc_va, size / sizeof(float), myrank, nranks);
  cudaDeviceSynchronize();
  MPI_Barrier(MPI_COMM_WORLD);
  check_correctness<<<nblocks, block_size>>>((float*)uc_va, size / sizeof(float), myrank, nranks);
  cudaDeviceSynchronize();
  MPI_Barrier(MPI_COMM_WORLD);

  for (size_t input_size = 1024; input_size <= size; input_size *= 2) {
    // warmup
    for (int i = 0; i < rept; i++) {
      testing<<<nblocks, block_size>>>((float*)mc_va, input_size / sizeof(float), myrank, nranks);
    }
    cudaDeviceSynchronize();
    MPI_Barrier(MPI_COMM_WORLD);
    double st = MPI_Wtime();
    for (int i = 0; i < rept; i++) {
      testing<<<nblocks, block_size>>>((float*)mc_va, input_size / sizeof(float), myrank, nranks);
    }
    cudaDeviceSynchronize();
    double en = MPI_Wtime();
    double time = (en - st) / rept;
    if (!myrank)
      printf("input_size %lu | Time = %f us, alg_bw = %f (GBps)\n", input_size, time * 1e6, input_size / 1e9 / time);
  }
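  // alg_bw counts each rank's input bytes once; by the usual allreduce convention (e.g. in
  // nccl-tests) the equivalent bus bandwidth would be alg_bw * 2 * (nranks - 1) / nranks.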
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  return 0;
}

#else  // !(CUDA_NVLS_API_AVAILABLE)

int main() {
  printf("This test requires NVLS to be enabled\n");
  return 0;
}

#endif  // !(CUDA_NVLS_API_AVAILABLE)