WIP: NCCL infra

This commit is contained in:
Iwan Kawrakow
2025-12-21 06:54:10 +00:00
parent e2f325fad3
commit 655f6ce301
4 changed files with 132 additions and 6 deletions

90
cmake/FindNCCL.cmake Normal file
View File

@@ -0,0 +1,90 @@
# Find the nccl libraries
#
# The following variables are optionally searched for defaults
# NCCL_ROOT: Base directory where all NCCL components are found
# NCCL_INCLUDE_DIR: Directory where NCCL header is found
# NCCL_LIB_DIR: Directory where NCCL library is found
#
# The following are set after configuration is done:
# NCCL_FOUND
# NCCL_INCLUDE_DIRS
# NCCL_LIBRARIES
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NCCL in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601
set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")
if ($ENV{NCCL_ROOT_DIR})
message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
endif()
list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})
find_path(NCCL_INCLUDE_DIRS
NAMES nccl.h
HINTS ${NCCL_INCLUDE_DIR})
if (USE_STATIC_NCCL)
MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
SET(NCCL_LIBNAME "nccl_static")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
else()
SET(NCCL_LIBNAME "nccl")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
endif()
find_library(NCCL_LIBRARIES
NAMES ${NCCL_LIBNAME}
HINTS ${NCCL_LIB_DIR})
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
if(NCCL_FOUND) # obtaining NCCL version and some sanity checks
set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
include(CheckCXXSymbolExists)
check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)
if (NCCL_VERSION_DEFINED)
set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
file(WRITE ${file} "
#include <iostream>
#include <nccl.h>
int main()
{
std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
int x;
ncclGetVersion(&x);
return x == NCCL_VERSION_CODE;
}
")
try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}"
LINK_LIBRARIES ${NCCL_LIBRARIES})
if (NOT NCCL_VERSION_MATCHED)
message(FATAL_ERROR "Found NCCL header version and library version do not match! \
(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
endif()
message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}")
else()
message(STATUS "NCCL version < 2.3.5-5")
endif ()
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
endif()

View File

@@ -462,6 +462,19 @@ if (GGML_CUDA)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
endif()
endif()
find_package(NCCL)
if (NCCL_FOUND)
message("==================== NCCL found!")
message("NCCL_LIBRARIES = ${NCCL_LIBRARIES}")
message("NCCL_INCLUDE_DIRS = ${NCCL_INCLUDE_DIRS}")
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${NCCL_LIBRARIES})
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${NCCL_INCLUDE_DIRS})
add_compile_definitions(GGML_USE_NCCL)
else()
message("==================== NCCL NOT found")
endif()
if (NOT GGML_MUSA)
set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_INCLUDES 0)
set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_LIBRARIES 0)

View File

@@ -246,6 +246,20 @@ static ggml_cuda_device_info ggml_cuda_init() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
#ifdef GGML_USE_NCCL
info.have_nccl = false;
if (info.device_count > 1) {
int gpu_list[GGML_CUDA_MAX_DEVICES];
for(int i = 0; i < info.device_count; ++i) gpu_list[i] = i;
auto status = ncclCommInitAll(info.nccl_coms, info.device_count, gpu_list);
if (status == ncclSuccess) {
printf("=============================== NCCL initialized\n");
info.have_nccl = true;
} else {
printf("=============================== NCCL initialization failed with status %d\n", int(status));
}
}
#endif
return info;
}
@@ -3733,12 +3747,12 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
}
#endif
#ifndef NDEBUG
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] != nullptr) {
assert(node->src[j]->buffer);
}
}
//assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
//for (int j = 0; j < GGML_MAX_SRC; j++) {
// if (node->src[j] != nullptr) {
// assert(node->src[j]->buffer);
// }
//}
#endif // NDEBUG
bool ok = ggml_cuda_compute_forward(*cuda_ctx, node, cgraph, i);

View File

@@ -34,6 +34,10 @@
#include "vendors/cuda.h"
#endif // defined(GGML_USE_HIPBLAS)
#ifdef GGML_USE_NCCL
#include <nccl.h>
#endif
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
@@ -754,6 +758,11 @@ struct ggml_cuda_device_info {
cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
#ifdef GGML_USE_NCCL
ncclComm_t nccl_coms[GGML_CUDA_MAX_DEVICES];
bool have_nccl;
#endif
};
const ggml_cuda_device_info & ggml_cuda_info();