Do not abort on NCCL initialization failure (#1120)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2026-01-08 09:19:50 +02:00
committed by GitHub
parent d581d75537
commit 0c2d924e94

View File

@@ -253,13 +253,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
int gpu_list[GGML_CUDA_MAX_DEVICES];
for(int i = 0; i < info.device_count; ++i) gpu_list[i] = i;
auto status = ncclCommInitAll(info.nccl_coms, info.device_count, gpu_list);
if (status == ncclSuccess) {
if (status != ncclSuccess) {
printf("=============================== NCCL initialization failed with status %d\n", int(status));
} else {
printf("=============================== NCCL main communicator initialized\n");
info.have_nccl = true;
} else {
printf("=============================== NCCL initialization failed with status %d\n", int(status));
GGML_ABORT("Fatal error");
}
auto com = info.nccl_coms + info.device_count;
if (info.device_count == 4) {
int devs[8] = {0,1, 2,3, 0,2, 1,3};
@@ -282,6 +280,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
printf("=============================== NCCL pair communicators for %d GPUs initialized\n", info.device_count);
}
}
}
#endif
return info;
}