Do not abort on NCCL initialization failure (#1120)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-05-01 11:51:53 +00:00 · 2026-01-08 09:19:50 +02:00
parent 5ef98f8b0f
commit 0456aa47d3
1 changed files with 20 additions and 21 deletions
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -253,13 +253,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
        int gpu_list[GGML_CUDA_MAX_DEVICES];
        for(int i = 0; i < info.device_count; ++i) gpu_list[i] = i;
        auto status = ncclCommInitAll(info.nccl_coms, info.device_count, gpu_list);
-        if (status == ncclSuccess) {
+        if (status != ncclSuccess) {
+            printf("=============================== NCCL initialization failed with status %d\n", int(status));
+        } else {
            printf("=============================== NCCL main communicator initialized\n");
            info.have_nccl = true;
-        } else {
-            printf("=============================== NCCL initialization failed with status %d\n", int(status));
-            GGML_ABORT("Fatal error");
-        }
            auto com = info.nccl_coms + info.device_count;
            if (info.device_count == 4) {
                int devs[8] = {0,1, 2,3, 0,2, 1,3};
@@ -282,6 +280,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
                printf("=============================== NCCL pair communicators for %d GPUs initialized\n", info.device_count);
            }
        }
+    }
 #endif
    return info;
 }