Fix multi-node H100 CI: CUDA compat, deploy improvements (#781)

## Summary

- **Multi-node H100 CI setup**: Improve architecture detection and GPU
configuration
- **Remove hardcoded VMSS hostnames** from deploy files
- **Fix CUDA compat library issue**: Remove stale compat paths from
Docker image for CUDA 12+. Instead, `peer_access_test` now returns a
distinct exit code (2) for CUDA init failure, and `setup.sh`
conditionally adds compat libs only when needed. This fixes
`cudaErrorSystemNotReady` (error 803) when the host driver is newer than
the container's compat libs.
- **Speed up deploy**: Replace recursive `parallel-scp` with
tar+scp+untar to avoid per-file SSH overhead.

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Binyang Li
2026-04-13 21:51:29 -07:00
committed by GitHub
parent b6d0ca13ca
commit ecd33722d4
12 changed files with 200 additions and 88 deletions

View File

@@ -158,11 +158,25 @@ RegisteredMemory::Impl::Impl(const std::vector<char>::const_iterator& begin,
 }
 }
 } else if (transports.has(Transport::CudaIpc)) {
+// When transports include both CudaIpc and IB (e.g., CudaIpc | IB0),
+// try CudaIpc first and fall back to IB on failure.
 auto entry = getTransportInfo(Transport::CudaIpc);
-auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
-// Create a memory map for the remote GPU memory. The memory map will keep the GpuIpcMem instance alive.
-this->remoteMemMap = gpuIpcMem->map();
-this->data = this->remoteMemMap.get();
+bool hasIB = (transports & AllIBTransports).any();
+try {
+auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
+this->remoteMemMap = gpuIpcMem->map();
+this->data = this->remoteMemMap.get();
+} catch (const BaseError& e) {
+if (!hasIB) {
+throw;
+}
+bool isSameHost = (getHostHash() == this->hostHash);
+if (isSameHost) {
+WARN(GPU, "CudaIpc import failed on same host, falling back to IB transport: ", e.what());
+} else {
+INFO(GPU, "CudaIpc import failed on remote host, falling back to IB transport: ", e.what());
+}
+}
 }
 if (this->data != nullptr) {
 INFO(GPU, "Opened CUDA IPC handle at pointer ", this->data);