Do not abort on NCCL initialization failure (#1120)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2026-01-08 09:19:50 +02:00
committed by GitHub
parent 5ef98f8b0f
commit 0456aa47d3

View File

@@ -253,33 +253,32 @@ static ggml_cuda_device_info ggml_cuda_init() {
int gpu_list[GGML_CUDA_MAX_DEVICES]; int gpu_list[GGML_CUDA_MAX_DEVICES];
for(int i = 0; i < info.device_count; ++i) gpu_list[i] = i; for(int i = 0; i < info.device_count; ++i) gpu_list[i] = i;
auto status = ncclCommInitAll(info.nccl_coms, info.device_count, gpu_list); auto status = ncclCommInitAll(info.nccl_coms, info.device_count, gpu_list);
if (status == ncclSuccess) { if (status != ncclSuccess) {
printf("=============================== NCCL initialization failed with status %d\n", int(status));
} else {
printf("=============================== NCCL main communicator initialized\n"); printf("=============================== NCCL main communicator initialized\n");
info.have_nccl = true; info.have_nccl = true;
} else {
printf("=============================== NCCL initialization failed with status %d\n", int(status));
GGML_ABORT("Fatal error");
}
auto com = info.nccl_coms + info.device_count;
if (info.device_count == 4) {
int devs[8] = {0,1, 2,3, 0,2, 1,3};
auto com = info.nccl_coms + info.device_count; auto com = info.nccl_coms + info.device_count;
for (int ip = 0; ip < 4; ++ip) { if (info.device_count == 4) {
if (auto status = ncclCommInitAll(com+2*ip, 2, devs+2*ip); status != ncclSuccess) { int devs[8] = {0,1, 2,3, 0,2, 1,3};
printf("=============================== NCCL initialization of pair %d failed with status %d\n", ip, int(status)); auto com = info.nccl_coms + info.device_count;
GGML_ABORT("Fatal error"); for (int ip = 0; ip < 4; ++ip) {
if (auto status = ncclCommInitAll(com+2*ip, 2, devs+2*ip); status != ncclSuccess) {
printf("=============================== NCCL initialization of pair %d failed with status %d\n", ip, int(status));
GGML_ABORT("Fatal error");
}
} }
} printf("=============================== NCCL pair communicators for %d GPUs initialized\n", info.device_count);
printf("=============================== NCCL pair communicators for %d GPUs initialized\n", info.device_count); } else if (info.device_count == 3) {
} else if (info.device_count == 3) { int devs[4] = {0,1, 0,2};
int devs[4] = {0,1, 0,2}; for (int ip = 0; ip < 2; ++ip) {
for (int ip = 0; ip < 2; ++ip) { if (auto status = ncclCommInitAll(com+2*ip, 2, devs+2*ip); status != ncclSuccess) {
if (auto status = ncclCommInitAll(com+2*ip, 2, devs+2*ip); status != ncclSuccess) { printf("=============================== NCCL initialization of pair %d failed with status %d\n", ip, int(status));
printf("=============================== NCCL initialization of pair %d failed with status %d\n", ip, int(status)); GGML_ABORT("Fatal error");
GGML_ABORT("Fatal error"); }
} }
printf("=============================== NCCL pair communicators for %d GPUs initialized\n", info.device_count);
} }
printf("=============================== NCCL pair communicators for %d GPUs initialized\n", info.device_count);
} }
} }
#endif #endif