mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-04-20 06:49:29 +00:00
Fix cpplint error in main branch (#740)
Fix the legacy cpplint error in main branch. --------- Co-authored-by: Qinghua Zhou <qinghuahzhou@microsoft.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Binyang Li <binyli@microsoft.com>
This commit is contained in:
@@ -9,7 +9,7 @@
|
||||
#include <sstream>
|
||||
|
||||
template <typename... Args>
|
||||
void log(Args &&...args) {
|
||||
void log(Args&&... args) {
|
||||
std::stringstream ss;
|
||||
(ss << ... << args);
|
||||
ss << std::endl;
|
||||
@@ -23,7 +23,7 @@ __device__ void spin_cycles(unsigned long long cycles) {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
|
||||
__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) {
|
||||
if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
|
||||
for (int i = 0; i < iter; ++i) {
|
||||
devHandle->relaxedWait();
|
||||
@@ -34,7 +34,7 @@ __global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, in
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
|
||||
__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) {
|
||||
if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
|
||||
for (int i = 0; i < iter; ++i) {
|
||||
devHandle->relaxedSignal();
|
||||
@@ -88,7 +88,7 @@ int main() {
|
||||
mscclpp::Semaphore sema0(/*localSemaphoreStub*/ semaStub0, /*remoteSemaphoreStub*/ semaStub1);
|
||||
mscclpp::BaseMemoryChannel memChan0(sema0);
|
||||
mscclpp::BaseMemoryChannelDeviceHandle memChanHandle0 = memChan0.deviceHandle();
|
||||
void *devHandle0;
|
||||
void* devHandle0;
|
||||
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle0, sizeof(mscclpp::BaseMemoryChannelDeviceHandle)));
|
||||
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle0, &memChanHandle0, sizeof(memChanHandle0), cudaMemcpyHostToDevice));
|
||||
|
||||
@@ -98,14 +98,14 @@ int main() {
|
||||
mscclpp::Semaphore sema1(/*localSemaphoreStub*/ semaStub1, /*remoteSemaphoreStub*/ semaStub0);
|
||||
mscclpp::BaseMemoryChannel memChan1(sema1);
|
||||
mscclpp::BaseMemoryChannelDeviceHandle memChanHandle1 = memChan1.deviceHandle();
|
||||
void *devHandle1;
|
||||
void* devHandle1;
|
||||
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle1, sizeof(mscclpp::BaseMemoryChannelDeviceHandle)));
|
||||
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle1, &memChanHandle1, sizeof(memChanHandle1), cudaMemcpyHostToDevice));
|
||||
|
||||
log("GPU 0: Launching gpuKernel0 ...");
|
||||
|
||||
MSCCLPP_CUDATHROW(cudaSetDevice(0));
|
||||
gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle0), iter);
|
||||
gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle0), iter);
|
||||
MSCCLPP_CUDATHROW(cudaGetLastError());
|
||||
|
||||
log("GPU 1: Launching gpuKernel1 ...");
|
||||
@@ -115,7 +115,7 @@ int main() {
|
||||
MSCCLPP_CUDATHROW(cudaEventCreate(&start));
|
||||
MSCCLPP_CUDATHROW(cudaEventCreate(&end));
|
||||
MSCCLPP_CUDATHROW(cudaEventRecord(start));
|
||||
gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle1), iter);
|
||||
gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle1), iter);
|
||||
MSCCLPP_CUDATHROW(cudaGetLastError());
|
||||
MSCCLPP_CUDATHROW(cudaEventRecord(end));
|
||||
MSCCLPP_CUDATHROW(cudaEventSynchronize(end));
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
#define PORT_NUMBER "50505"
|
||||
|
||||
template <typename... Args>
|
||||
void log(Args &&...args) {
|
||||
void log(Args&&... args) {
|
||||
std::stringstream ss;
|
||||
(ss << ... << args);
|
||||
ss << std::endl;
|
||||
@@ -50,7 +50,7 @@ __device__ void spin_cycles(unsigned long long cycles) {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
|
||||
__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) {
|
||||
if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
|
||||
for (int i = 0; i < iter; ++i) {
|
||||
devHandle->relaxedWait();
|
||||
@@ -61,7 +61,7 @@ __global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, in
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
|
||||
__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) {
|
||||
if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
|
||||
for (int i = 0; i < iter; ++i) {
|
||||
devHandle->relaxedSignal();
|
||||
@@ -115,14 +115,14 @@ void worker(int gpuId) {
|
||||
|
||||
mscclpp::BaseMemoryChannel memChan(sema);
|
||||
auto memChanHandle = memChan.deviceHandle();
|
||||
void *devHandle;
|
||||
void* devHandle;
|
||||
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle)));
|
||||
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice));
|
||||
|
||||
log("GPU ", gpuId, ": Launching a GPU kernel ...");
|
||||
|
||||
if (gpuId == 0) {
|
||||
gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle), iter);
|
||||
gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle), iter);
|
||||
MSCCLPP_CUDATHROW(cudaGetLastError());
|
||||
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
|
||||
} else {
|
||||
@@ -130,7 +130,7 @@ void worker(int gpuId) {
|
||||
MSCCLPP_CUDATHROW(cudaEventCreate(&start));
|
||||
MSCCLPP_CUDATHROW(cudaEventCreate(&end));
|
||||
MSCCLPP_CUDATHROW(cudaEventRecord(start));
|
||||
gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle), iter);
|
||||
gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle), iter);
|
||||
MSCCLPP_CUDATHROW(cudaGetLastError());
|
||||
MSCCLPP_CUDATHROW(cudaEventRecord(end));
|
||||
MSCCLPP_CUDATHROW(cudaEventSynchronize(end));
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
#define PORT_NUMBER "50505"
|
||||
|
||||
template <typename... Args>
|
||||
void log(Args &&...args) {
|
||||
void log(Args&&... args) {
|
||||
std::stringstream ss;
|
||||
(ss << ... << args);
|
||||
ss << std::endl;
|
||||
@@ -47,7 +47,7 @@ int wait_process(int pid) {
|
||||
|
||||
__device__ mscclpp::DeviceSyncer devSyncer;
|
||||
|
||||
__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
|
||||
__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tid == 0) {
|
||||
devHandle->relaxedSignal();
|
||||
@@ -65,7 +65,7 @@ __global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, si
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
|
||||
__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tid == 0) {
|
||||
devHandle->relaxedSignal();
|
||||
@@ -79,7 +79,7 @@ __global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, si
|
||||
devHandle->get(srcOffset, dstOffset, copyBytes, /*threadId*/ tid, /*numThreads*/ blockDim.x * gridDim.x);
|
||||
}
|
||||
|
||||
__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank,
|
||||
__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank,
|
||||
uint32_t flag) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tid == 0) {
|
||||
@@ -95,7 +95,7 @@ __global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHand
|
||||
devHandle->unpackPackets(pktBufOffset, dstOffset, copyBytes, tid, blockDim.x * gridDim.x, flag);
|
||||
}
|
||||
|
||||
void worker(int myRank, int gpuId, const std::string &ipPort) {
|
||||
void worker(int myRank, int gpuId, const std::string& ipPort) {
|
||||
MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
|
||||
const int remoteRank = myRank == 0 ? 1 : 0;
|
||||
const int nRanks = 2;
|
||||
@@ -132,8 +132,8 @@ void worker(int myRank, int gpuId, const std::string &ipPort) {
|
||||
auto memChanHandle = memChan.deviceHandle();
|
||||
auto memPktChanHandle = memPktChan.deviceHandle();
|
||||
|
||||
void *devHandle;
|
||||
void *devPktHandle;
|
||||
void* devHandle;
|
||||
void* devPktHandle;
|
||||
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle)));
|
||||
MSCCLPP_CUDATHROW(cudaMalloc(&devPktHandle, sizeof(memPktChanHandle)));
|
||||
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice));
|
||||
@@ -145,18 +145,18 @@ void worker(int myRank, int gpuId, const std::string &ipPort) {
|
||||
std::function<void(size_t)> kernels[3];
|
||||
|
||||
kernels[0] = [&](size_t copyBytes) {
|
||||
bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devHandle),
|
||||
copyBytes, myRank);
|
||||
bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devHandle), copyBytes,
|
||||
myRank);
|
||||
};
|
||||
|
||||
kernels[1] = [&](size_t copyBytes) {
|
||||
bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devHandle),
|
||||
copyBytes, myRank);
|
||||
bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devHandle), copyBytes,
|
||||
myRank);
|
||||
};
|
||||
|
||||
kernels[2] = [&](size_t copyBytes) {
|
||||
static uint32_t flag = 1;
|
||||
bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devPktHandle),
|
||||
bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devPktHandle),
|
||||
copyBytes, myRank, flag++);
|
||||
};
|
||||
|
||||
@@ -215,7 +215,7 @@ void worker(int myRank, int gpuId, const std::string &ipPort) {
|
||||
bootstrap->barrier();
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int main(int argc, char** argv) {
|
||||
if (argc == 1) {
|
||||
int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER); });
|
||||
int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER); });
|
||||
@@ -241,7 +241,7 @@ int main(int argc, char **argv) {
|
||||
try {
|
||||
rank = std::stoi(argv[2]);
|
||||
gpuId = std::stoi(argv[3]);
|
||||
} catch (const std::exception &) {
|
||||
} catch (const std::exception&) {
|
||||
log("Error: rank and gpu_id must be valid integers.");
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
#define PORT_NUMBER "50505"
|
||||
|
||||
template <typename... Args>
|
||||
void log(Args &&...args) {
|
||||
void log(Args&&... args) {
|
||||
std::stringstream ss;
|
||||
(ss << ... << args);
|
||||
ss << std::endl;
|
||||
@@ -45,7 +45,7 @@ int wait_process(int pid) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
|
||||
__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tid == 0) {
|
||||
devHandle->signal();
|
||||
@@ -58,7 +58,7 @@ __global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size
|
||||
}
|
||||
}
|
||||
|
||||
void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport transport) {
|
||||
void worker(int rank, int gpuId, const std::string& ipPort, mscclpp::Transport transport) {
|
||||
MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
|
||||
const int myRank = rank;
|
||||
const int remoteRank = myRank == 0 ? 1 : 0;
|
||||
@@ -90,7 +90,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t
|
||||
|
||||
auto portChanHandle = portChan.deviceHandle();
|
||||
|
||||
void *devHandle;
|
||||
void* devHandle;
|
||||
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(portChanHandle)));
|
||||
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &portChanHandle, sizeof(portChanHandle), cudaMemcpyHostToDevice));
|
||||
|
||||
@@ -100,7 +100,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t
|
||||
std::function<void(size_t)> kernels[1];
|
||||
|
||||
kernels[0] = [&](size_t copyBytes) {
|
||||
bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast<mscclpp::PortChannelDeviceHandle *>(devHandle), copyBytes,
|
||||
bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast<mscclpp::PortChannelDeviceHandle*>(devHandle), copyBytes,
|
||||
myRank);
|
||||
};
|
||||
|
||||
@@ -166,7 +166,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t
|
||||
bootstrap->barrier();
|
||||
}
|
||||
|
||||
mscclpp::Transport parseTransport(const std::string &transportStr) {
|
||||
mscclpp::Transport parseTransport(const std::string& transportStr) {
|
||||
if (transportStr == "CudaIpc") return mscclpp::Transport::CudaIpc;
|
||||
if (transportStr == "IB0") return mscclpp::Transport::IB0;
|
||||
if (transportStr == "IB1") return mscclpp::Transport::IB1;
|
||||
@@ -180,7 +180,7 @@ mscclpp::Transport parseTransport(const std::string &transportStr) {
|
||||
throw std::runtime_error("Unknown transport: " + transportStr);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int main(int argc, char** argv) {
|
||||
if (argc == 1) {
|
||||
int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER, mscclpp::Transport::CudaIpc); });
|
||||
int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER, mscclpp::Transport::CudaIpc); });
|
||||
|
||||
@@ -19,11 +19,11 @@
|
||||
#else // defined(DEBUG_BUILD)
|
||||
|
||||
#if defined(MSCCLPP_DEVICE_HIP)
|
||||
extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line,
|
||||
const char *__function);
|
||||
extern "C" __device__ void __assert_fail(const char* __assertion, const char* __file, unsigned int __line,
|
||||
const char* __function);
|
||||
#else // !defined(MSCCLPP_DEVICE_HIP)
|
||||
extern "C" __host__ __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line,
|
||||
const char *__function) __THROW;
|
||||
extern "C" __host__ __device__ void __assert_fail(const char* __assertion, const char* __file, unsigned int __line,
|
||||
const char* __function) __THROW;
|
||||
#endif // !defined(MSCCLPP_DEVICE_HIP)
|
||||
|
||||
/// Assert a condition on the device and print a message if the condition is false.
|
||||
|
||||
@@ -11,17 +11,17 @@ using namespace mscclpp;
|
||||
|
||||
#define REGISTER_EXCEPTION_TRANSLATOR(name_) \
|
||||
nb::register_exception_translator( \
|
||||
[](const std::exception_ptr &p, void *payload) { \
|
||||
[](const std::exception_ptr& p, void* payload) { \
|
||||
try { \
|
||||
std::rethrow_exception(p); \
|
||||
} catch (const name_ &e) { \
|
||||
PyErr_SetObject(reinterpret_cast<PyObject *>(payload), \
|
||||
} catch (const name_& e) { \
|
||||
PyErr_SetObject(reinterpret_cast<PyObject*>(payload), \
|
||||
PyTuple_Pack(2, PyLong_FromLong(long(e.getErrorCode())), PyUnicode_FromString(e.what()))); \
|
||||
} \
|
||||
}, \
|
||||
m.attr(#name_).ptr());
|
||||
|
||||
void register_error(nb::module_ &m) {
|
||||
void register_error(nb::module_& m) {
|
||||
nb::enum_<ErrorCode>(m, "CppErrorCode")
|
||||
.value("SystemError", ErrorCode::SystemError)
|
||||
.value("InternalError", ErrorCode::InternalError)
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
namespace nb = nanobind;
|
||||
|
||||
void register_npkit(nb::module_ &m) {
|
||||
void register_npkit(nb::module_& m) {
|
||||
nb::module_ sub_m = m.def_submodule("cpp_npkit", "NPKit functions");
|
||||
sub_m.def("init", &NpKit::Init);
|
||||
sub_m.def("dump", &NpKit::Dump);
|
||||
|
||||
@@ -6,7 +6,7 @@ int getDeviceNumaNode(int cudaDev);
|
||||
void numaBind(int node);
|
||||
}; // namespace mscclpp
|
||||
|
||||
void register_numa(nb::module_ &m) {
|
||||
void register_numa(nb::module_& m) {
|
||||
nb::module_ sub_m = m.def_submodule("cpp_numa", "numa functions");
|
||||
sub_m.def("get_device_numa_node", &mscclpp::getDeviceNumaNode);
|
||||
sub_m.def("numa_bind", &mscclpp::numaBind);
|
||||
|
||||
@@ -23,14 +23,14 @@ void CudaIpcStream::setStreamIfNeeded() {
|
||||
}
|
||||
}
|
||||
|
||||
void CudaIpcStream::memcpyD2D(void *dst, const void *src, size_t nbytes) {
|
||||
void CudaIpcStream::memcpyD2D(void* dst, const void* src, size_t nbytes) {
|
||||
CudaDeviceGuard deviceGuard(deviceId_);
|
||||
setStreamIfNeeded();
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, *stream_));
|
||||
dirty_ = true;
|
||||
}
|
||||
|
||||
void CudaIpcStream::memcpyH2D(void *dst, const void *src, size_t nbytes) {
|
||||
void CudaIpcStream::memcpyH2D(void* dst, const void* src, size_t nbytes) {
|
||||
CudaDeviceGuard deviceGuard(deviceId_);
|
||||
setStreamIfNeeded();
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyHostToDevice, *stream_));
|
||||
@@ -48,7 +48,7 @@ void CudaIpcStream::sync() {
|
||||
|
||||
Context::Impl::Impl() {}
|
||||
|
||||
IbCtx *Context::Impl::getIbContext(Transport ibTransport) {
|
||||
IbCtx* Context::Impl::getIbContext(Transport ibTransport) {
|
||||
// Find IB context or create it
|
||||
auto it = ibContexts_.find(ibTransport);
|
||||
if (it == ibContexts_.end()) {
|
||||
@@ -70,7 +70,7 @@ MSCCLPP_API_CPP Context::Context() : pimpl_(std::make_unique<Impl>()) {}
|
||||
|
||||
MSCCLPP_API_CPP Context::~Context() = default;
|
||||
|
||||
MSCCLPP_API_CPP RegisteredMemory Context::registerMemory(void *ptr, size_t size, TransportFlags transports) {
|
||||
MSCCLPP_API_CPP RegisteredMemory Context::registerMemory(void* ptr, size_t size, TransportFlags transports) {
|
||||
return RegisteredMemory(std::make_shared<RegisteredMemory::Impl>(ptr, size, transports, *pimpl_));
|
||||
}
|
||||
|
||||
@@ -78,7 +78,7 @@ MSCCLPP_API_CPP Endpoint Context::createEndpoint(EndpointConfig config) {
|
||||
return Endpoint(std::make_shared<Endpoint::Impl>(config, *pimpl_));
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP Connection Context::connect(const Endpoint &localEndpoint, const Endpoint &remoteEndpoint) {
|
||||
MSCCLPP_API_CPP Connection Context::connect(const Endpoint& localEndpoint, const Endpoint& remoteEndpoint) {
|
||||
if (localEndpoint.device().type == DeviceType::GPU && localEndpoint.device().id < 0) {
|
||||
throw Error("No GPU device ID provided for local endpoint", ErrorCode::InvalidUsage);
|
||||
}
|
||||
|
||||
@@ -24,9 +24,9 @@ class CudaIpcStream {
|
||||
public:
|
||||
CudaIpcStream(int deviceId);
|
||||
|
||||
void memcpyD2D(void *dst, const void *src, size_t nbytes);
|
||||
void memcpyD2D(void* dst, const void* src, size_t nbytes);
|
||||
|
||||
void memcpyH2D(void *dst, const void *src, size_t nbytes);
|
||||
void memcpyH2D(void* dst, const void* src, size_t nbytes);
|
||||
|
||||
void sync();
|
||||
|
||||
@@ -44,7 +44,7 @@ struct Context::Impl {
|
||||
|
||||
Impl();
|
||||
|
||||
IbCtx *getIbContext(Transport ibTransport);
|
||||
IbCtx* getIbContext(Transport ibTransport);
|
||||
std::shared_ptr<uint64_t> getToken();
|
||||
};
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ struct GpuIpcMemHandle {
|
||||
char handle[64];
|
||||
} fabric;
|
||||
|
||||
static void deleter(GpuIpcMemHandle *handle);
|
||||
static void deleter(GpuIpcMemHandle* handle);
|
||||
|
||||
// We make GpuIpcMemHandle trivially copyable for easy serialization,
|
||||
// and thus it cannot have explicit destructors.
|
||||
@@ -61,7 +61,7 @@ struct GpuIpcMemHandle {
|
||||
using Base::Base;
|
||||
|
||||
// Allow implicit conversion from Base
|
||||
UniquePtr(Base &&other) : Base(std::move(other)) {}
|
||||
UniquePtr(Base&& other) : Base(std::move(other)) {}
|
||||
};
|
||||
|
||||
static UniquePtr create(const CUdeviceptr ptr);
|
||||
@@ -70,7 +70,7 @@ struct GpuIpcMemHandle {
|
||||
|
||||
using UniqueGpuIpcMemHandle = GpuIpcMemHandle::UniquePtr;
|
||||
|
||||
std::ostream &operator<<(std::ostream &os, const GpuIpcMemHandle::TypeFlags &typeFlags);
|
||||
std::ostream& operator<<(std::ostream& os, const GpuIpcMemHandle::TypeFlags& typeFlags);
|
||||
|
||||
static_assert(std::is_trivially_copyable_v<GpuIpcMemHandle>);
|
||||
|
||||
@@ -82,7 +82,7 @@ class GpuIpcMem : public std::enable_shared_from_this<GpuIpcMem> {
|
||||
/// Create a GpuIpcMem instance from a GpuIpcMemHandle.
|
||||
/// @param handle The handle to import.
|
||||
/// @return A shared_ptr to the created GpuIpcMem instance.
|
||||
static std::shared_ptr<GpuIpcMem> create(const GpuIpcMemHandle &handle);
|
||||
static std::shared_ptr<GpuIpcMem> create(const GpuIpcMemHandle& handle);
|
||||
|
||||
~GpuIpcMem();
|
||||
|
||||
@@ -102,7 +102,7 @@ class GpuIpcMem : public std::enable_shared_from_this<GpuIpcMem> {
|
||||
std::shared_ptr<void> mapMulticast(int numDevices, size_t mcOffset, CUdeviceptr bufferAddr, size_t bufferSize);
|
||||
|
||||
private:
|
||||
GpuIpcMem(const GpuIpcMemHandle &handle);
|
||||
GpuIpcMem(const GpuIpcMemHandle& handle);
|
||||
|
||||
GpuIpcMemHandle handle_;
|
||||
CUmemGenericAllocationHandle allocHandle_;
|
||||
|
||||
@@ -12,12 +12,12 @@ namespace mscclpp {
|
||||
|
||||
struct IBVerbs {
|
||||
private:
|
||||
static void *dlsym(const std::string &symbol, bool allowReturnNull = false);
|
||||
static void* dlsym(const std::string& symbol, bool allowReturnNull = false);
|
||||
|
||||
public:
|
||||
#define REGISTER_IBV_FUNC_WITH_NAME(name__, func__) \
|
||||
template <typename... Args> \
|
||||
static inline auto(name__)(Args && ...args) { \
|
||||
static inline auto(name__)(Args && ... args) { \
|
||||
static_assert(sizeof(&::func__) > 0, #func__ " is expected be a function, not a macro"); \
|
||||
static decltype(&::func__) impl = nullptr; \
|
||||
if (!impl) impl = reinterpret_cast<decltype(impl)>(IBVerbs::dlsym(#func__)); \
|
||||
@@ -46,7 +46,7 @@ struct IBVerbs {
|
||||
REGISTER_IBV_FUNC(ibv_wc_status_str)
|
||||
|
||||
static bool isDmabufSupported();
|
||||
static struct ibv_mr *ibv_reg_dmabuf_mr(struct ibv_pd *, uint64_t, size_t, uint64_t, int, int);
|
||||
static struct ibv_mr* ibv_reg_dmabuf_mr(struct ibv_pd*, uint64_t, size_t, uint64_t, int, int);
|
||||
|
||||
///
|
||||
/// Below is for cases where the API (may be / is) a macro. Refer to `infiniband/verbs.h`.
|
||||
@@ -57,8 +57,8 @@ struct IBVerbs {
|
||||
#else // defined(ibv_get_device_list)
|
||||
#undef ibv_get_device_list
|
||||
REGISTER_IBV_FUNC(ibv_static_providers)
|
||||
static inline struct ibv_device **ibv_get_device_list(int *num_devices) {
|
||||
using FuncType = struct ibv_device **(*)(int *);
|
||||
static inline struct ibv_device** ibv_get_device_list(int* num_devices) {
|
||||
using FuncType = struct ibv_device** (*)(int*);
|
||||
static FuncType impl = nullptr;
|
||||
if (!impl) impl = reinterpret_cast<FuncType>(IBVerbs::dlsym("ibv_get_device_list"));
|
||||
IBVerbs::ibv_static_providers(NULL, _RDMA_STATIC_PREFIX(RDMA_STATIC_PROVIDERS), NULL);
|
||||
@@ -67,21 +67,21 @@ struct IBVerbs {
|
||||
#endif // defined(ibv_get_device_list)
|
||||
|
||||
#undef ibv_query_port
|
||||
static inline int ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) {
|
||||
static inline int ibv_query_port(struct ibv_context* context, uint8_t port_num, struct ibv_port_attr* port_attr) {
|
||||
static decltype(&::ibv_query_port) impl = nullptr;
|
||||
if (!impl) impl = reinterpret_cast<decltype(impl)>(IBVerbs::dlsym("ibv_query_port"));
|
||||
struct verbs_context *vctx = verbs_get_ctx_op(context, query_port);
|
||||
struct verbs_context* vctx = verbs_get_ctx_op(context, query_port);
|
||||
if (!vctx) {
|
||||
int rc;
|
||||
::memset(port_attr, 0, sizeof(*port_attr));
|
||||
rc = impl(context, port_num, (struct _compat_ibv_port_attr *)port_attr);
|
||||
rc = impl(context, port_num, (struct _compat_ibv_port_attr*)port_attr);
|
||||
return rc;
|
||||
}
|
||||
return vctx->query_port(context, port_num, port_attr, sizeof(*port_attr));
|
||||
}
|
||||
|
||||
#undef ibv_reg_mr
|
||||
static inline struct ibv_mr *ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) {
|
||||
static inline struct ibv_mr* ibv_reg_mr(struct ibv_pd* pd, void* addr, size_t length, int access) {
|
||||
static decltype(&::ibv_reg_mr) impl = nullptr;
|
||||
static decltype(&::ibv_reg_mr_iova2) impl_iova2 = nullptr;
|
||||
int is_access_const = __builtin_constant_p(((int)(access)&IBV_ACCESS_OPTIONAL_RANGE) == 0);
|
||||
@@ -98,11 +98,11 @@ struct IBVerbs {
|
||||
/// Below is for cases where the API (may be / is) a static function. Refer to `infiniband/verbs.h`.
|
||||
///
|
||||
|
||||
static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
|
||||
static inline int ibv_post_send(struct ibv_qp* qp, struct ibv_send_wr* wr, struct ibv_send_wr** bad_wr) {
|
||||
return qp->context->ops.post_send(qp, wr, bad_wr);
|
||||
}
|
||||
|
||||
static inline int ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc) {
|
||||
static inline int ibv_poll_cq(struct ibv_cq* cq, int num_entries, struct ibv_wc* wc) {
|
||||
return cq->context->ops.poll_cq(cq, num_entries, wc);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -8,11 +8,11 @@
|
||||
|
||||
extern "C" __attribute__((visibility("default"))) unsigned int la_version(unsigned int) { return LAV_CURRENT; }
|
||||
|
||||
extern "C" __attribute__((visibility("default"))) char *la_objsearch(const char *name, uintptr_t *, unsigned int) {
|
||||
const char *library = "libmscclpp_nccl.so";
|
||||
extern "C" __attribute__((visibility("default"))) char* la_objsearch(const char* name, uintptr_t*, unsigned int) {
|
||||
const char* library = "libmscclpp_nccl.so";
|
||||
if (strcmp(name, "libnccl.so.2") && strcmp(name, "libnccl.so") && strcmp(name, "librccl.so") &&
|
||||
strcmp(name, "librccl.so.1")) {
|
||||
return (char *)name;
|
||||
return (char*)name;
|
||||
}
|
||||
return (char *)library;
|
||||
return (char*)library;
|
||||
}
|
||||
@@ -12,10 +12,10 @@
|
||||
|
||||
__constant__ mscclpp::PortChannelDeviceHandle gPortChannel;
|
||||
|
||||
__global__ void kernelLocalPortChannelTest(void *dst, void *src, size_t bytes, int *ret) {
|
||||
__global__ void kernelLocalPortChannelTest(void* dst, void* src, size_t bytes, int* ret) {
|
||||
if (blockIdx.x == 0) {
|
||||
// sender
|
||||
int *ptr = reinterpret_cast<int *>(src);
|
||||
int* ptr = reinterpret_cast<int*>(src);
|
||||
for (size_t idx = threadIdx.x; idx < bytes / sizeof(int); idx += blockDim.x) {
|
||||
ptr[idx] = MAGIC_CONST;
|
||||
}
|
||||
@@ -29,7 +29,7 @@ __global__ void kernelLocalPortChannelTest(void *dst, void *src, size_t bytes, i
|
||||
gPortChannel.wait();
|
||||
}
|
||||
__syncthreads();
|
||||
int *ptr = reinterpret_cast<int *>(dst);
|
||||
int* ptr = reinterpret_cast<int*>(dst);
|
||||
for (size_t idx = threadIdx.x; idx < bytes / sizeof(int); idx += blockDim.x) {
|
||||
if (ptr[idx] != MAGIC_CONST) {
|
||||
*ret = 1; // Error: value mismatch
|
||||
|
||||
Reference in New Issue
Block a user