mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-05-01 11:51:53 +00:00
Do not abort on NCCL initialization failure (#1120)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -253,13 +253,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
int gpu_list[GGML_CUDA_MAX_DEVICES];
|
||||
for(int i = 0; i < info.device_count; ++i) gpu_list[i] = i;
|
||||
auto status = ncclCommInitAll(info.nccl_coms, info.device_count, gpu_list);
|
||||
if (status == ncclSuccess) {
|
||||
if (status != ncclSuccess) {
|
||||
printf("=============================== NCCL initialization failed with status %d\n", int(status));
|
||||
} else {
|
||||
printf("=============================== NCCL main communicator initialized\n");
|
||||
info.have_nccl = true;
|
||||
} else {
|
||||
printf("=============================== NCCL initialization failed with status %d\n", int(status));
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
auto com = info.nccl_coms + info.device_count;
|
||||
if (info.device_count == 4) {
|
||||
int devs[8] = {0,1, 2,3, 0,2, 1,3};
|
||||
@@ -282,6 +280,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
printf("=============================== NCCL pair communicators for %d GPUs initialized\n", info.device_count);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return info;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user