mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
Fix multi-node H100 CI: CUDA compat, deploy improvements (#781)
## Summary

- **Multi-node H100 CI setup**: Improve architecture detection and GPU configuration.
- **Remove hardcoded VMSS hostnames** from deploy files.
- **Fix CUDA compat library issue**: Remove stale compat paths from the Docker image for CUDA 12+. Instead, `peer_access_test` now returns a distinct exit code (2) for CUDA init failure, and `setup.sh` conditionally adds compat libs only when needed. This fixes `cudaErrorSystemNotReady` (error 803) when the host driver is newer than the container's compat libs.
- **Speed up deploy**: Replace recursive `parallel-scp` with tar+scp+untar to avoid per-file SSH overhead.

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -158,11 +158,25 @@ RegisteredMemory::Impl::Impl(const std::vector<char>::const_iterator& begin,
|
||||
}
|
||||
}
|
||||
} else if (transports.has(Transport::CudaIpc)) {
|
||||
// When transports include both CudaIpc and IB (e.g., CudaIpc | IB0),
|
||||
// try CudaIpc first and fall back to IB on failure.
|
||||
auto entry = getTransportInfo(Transport::CudaIpc);
|
||||
auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
|
||||
// Create a memory map for the remote GPU memory. The memory map will keep the GpuIpcMem instance alive.
|
||||
this->remoteMemMap = gpuIpcMem->map();
|
||||
this->data = this->remoteMemMap.get();
|
||||
bool hasIB = (transports & AllIBTransports).any();
|
||||
try {
|
||||
auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
|
||||
this->remoteMemMap = gpuIpcMem->map();
|
||||
this->data = this->remoteMemMap.get();
|
||||
} catch (const BaseError& e) {
|
||||
if (!hasIB) {
|
||||
throw;
|
||||
}
|
||||
bool isSameHost = (getHostHash() == this->hostHash);
|
||||
if (isSameHost) {
|
||||
WARN(GPU, "CudaIpc import failed on same host, falling back to IB transport: ", e.what());
|
||||
} else {
|
||||
INFO(GPU, "CudaIpc import failed on remote host, falling back to IB transport: ", e.what());
|
||||
}
|
||||
}
|
||||
}
|
||||
if (this->data != nullptr) {
|
||||
INFO(GPU, "Opened CUDA IPC handle at pointer ", this->data);
|
||||
|
||||
Reference in New Issue
Block a user