diff --git a/cmake/FindNCCL.cmake b/cmake/FindNCCL.cmake new file mode 100644 index 00000000..55123327 --- /dev/null +++ b/cmake/FindNCCL.cmake @@ -0,0 +1,90 @@ +# Find the nccl libraries +# +# The following variables are optionally searched for defaults +# NCCL_ROOT: Base directory where all NCCL components are found +# NCCL_INCLUDE_DIR: Directory where NCCL header is found +# NCCL_LIB_DIR: Directory where NCCL library is found +# +# The following are set after configuration is done: +# NCCL_FOUND +# NCCL_INCLUDE_DIRS +# NCCL_LIBRARIES +# +# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks +# install NCCL in the same location as the CUDA toolkit. +# See https://github.com/caffe2/caffe2/issues/1601 + +set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers") +set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries") +set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with") + +if ($ENV{NCCL_ROOT_DIR}) + message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.") +endif() +list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}) +# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12. +list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT}) + +find_path(NCCL_INCLUDE_DIRS + NAMES nccl.h + HINTS ${NCCL_INCLUDE_DIR}) + +if (USE_STATIC_NCCL) + MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.") + SET(NCCL_LIBNAME "nccl_static") + if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified + set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif() +else() + SET(NCCL_LIBNAME "nccl") + if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified + set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif() +endif() + +find_library(NCCL_LIBRARIES + NAMES ${NCCL_LIBNAME} + HINTS ${NCCL_LIB_DIR}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES) + +if(NCCL_FOUND) # obtaining NCCL version and some sanity checks + set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h") + message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...") + set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES}) + list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS}) + include(CheckCXXSymbolExists) + check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED) + + if (NCCL_VERSION_DEFINED) + set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc") + file(WRITE ${file} " + #include + #include + int main() + { + std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl; + int x; + ncclGetVersion(&x); + return x == NCCL_VERSION_CODE; + } +") + try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file} + RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}" + LINK_LIBRARIES ${NCCL_LIBRARIES}) + if (NOT NCCL_VERSION_MATCHED) + message(FATAL_ERROR "Found NCCL header version and library version do not match! 
\ +(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.") + endif() + message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}") + else() + message(STATUS "NCCL version < 2.3.5-5") + endif () + set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES}) + + message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})") + mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES) +endif() + diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 6ba18d92..88ee35ea 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -97,6 +97,7 @@ endif() option(GGML_LASX "ggml: enable lasx" ON) option(GGML_LSX "ggml: enable lsx" ON) option(GGML_SVE "ggml: enable SVE" OFF) +option(GGML_NCCL "ggml: enable NCCL" ON) if (WIN32) set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version") diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index c1b4c8ef..9f312863 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -689,6 +689,9 @@ extern "C" { GGML_OP_GLU, + GGML_OP_REDUCE, + GGML_OP_FAKE_CPY, + GGML_OP_COUNT, }; @@ -3034,6 +3037,17 @@ extern "C" { struct ggml_tensor ** splits; } ggml_split_tensor_t; + GGML_API struct ggml_tensor * ggml_reduce( + struct ggml_context * ctx, + struct ggml_tensor ** a, + int n, + enum ggml_op op); + + GGML_API struct ggml_tensor * ggml_fake_cpy( + struct ggml_context * ctx, + struct ggml_tensor * dst, + struct ggml_tensor * src); + #ifdef __cplusplus } #endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index c9acf1fc..c00c2f23 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -462,6 +462,21 @@ if (GGML_CUDA) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... 
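For context on the new ggml_reduce API declared above: callers build one partial result per device, store the partials in a fixed-size array indexed by device id (leaving nullptr for devices that hold no split of the weights), and combine them with a single GGML_OP_REDUCE node. The sketch below mirrors the pattern this patch uses later in llm_build_ffn; the names n_device, has_split and build_partial_ffn are illustrative placeholders, not functions from the patch.

    // Illustrative sketch only (assumed helper names); mirrors how llm_build_ffn uses ggml_reduce.
    std::vector<ggml_tensor *> partials(n_device, nullptr);    // one slot per device, nullptr if unused
    for (int id = 0; id < n_device; ++id) {
        if (!has_split(id)) continue;                          // hypothetical: device holds no weight split
        partials[id] = build_partial_ffn(ctx, id);             // hypothetical: partial FFN result on device id
        ggml_build_forward_expand(graph, partials[id]);
    }
    // GGML_OP_REDUCE sums all non-null sources; the result is a view of the last non-null partial.
    ggml_tensor * combined = ggml_reduce(ctx, partials.data(), n_device, GGML_OP_ADD);
    ggml_build_forward_expand(graph, combined);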
endif() endif() + + if (GGML_NCCL) + find_package(NCCL) + if (NCCL_FOUND) + message("==================== NCCL found!") + message("NCCL_LIBRARIES = ${NCCL_LIBRARIES}") + message("NCCL_INCLUDE_DIRS = ${NCCL_INCLUDE_DIRS}") + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${NCCL_LIBRARIES}) + set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${NCCL_INCLUDE_DIRS}) + add_compile_definitions(GGML_USE_NCCL) + else() + message("==================== NCCL NOT found -> building without NCCL support") + endif() + endif() + + if (NOT GGML_MUSA) set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_INCLUDES 0) set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_LIBRARIES 0) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 30db6939..65739cf3 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1414,13 +1414,59 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // do not overwrite user assignments if (*leaf_backend_id == -1) { *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf); - //printf("Pass 1: assigned backend %d to leaf %d, %s\n", *leaf_backend_id, i, graph->leafs[i]->name); } } for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; int * node_backend_id = &tensor_backend_id(node); + if (node->op == GGML_OP_REDUCE) { + auto view_src = node->view_src; + int src_id = -1; + for (int j = 0; j < node->op_params[1]; ++j) { + if (node->src[j]) { + int * this_node_backend_id = &tensor_backend_id(node->src[j]); + if (*this_node_backend_id == -1) { + *this_node_backend_id = j; + } else { + GGML_ASSERT(*this_node_backend_id == j); + } + if (view_src == node->src[j]) { + src_id = j; + } + } + } + if (src_id >= 0) { + int * this_node_backend_id = &tensor_backend_id(view_src); + *this_node_backend_id = tensor_backend_id(node->src[src_id]); + *node_backend_id = *this_node_backend_id; + } + } + else if (node->op == GGML_OP_MUL && node->src[0]->op == GGML_OP_NORM) { + // This is a hack for Cohere2. Without this hack the scheduler creates + // totally nonsensical splits for that arch + int * src1_id = &tensor_backend_id(node->src[1]); + if (*src1_id >= 0) { + int * src0_id = &tensor_backend_id(node->src[0]); + int * dst_id = &tensor_backend_id(node); + *src0_id = *src1_id; + *dst_id = *src1_id; + // For some reason that I don't understand, we can have the norm backend already assigned + // at this point. How?
That's why this more logical approach of first checking is commented out + //if (*src0_id < 0) { + // *src0_id = *src1_id; + //} else { + // printf("Oops: backend_id_src0(%s) = %d, backend_id_src1(%s) = %d\n", node->src[0]->name, *src0_id, node->src[1]->name, *src1_id); + // //GGML_ASSERT(*src0_id == *src1_id); + //} + //if (*dst_id < 0) { + // *dst_id = *src1_id; + //} else { + // printf("Oops: backend_id_dst(%s) = %d, backend_id_src1(%s) = %d\n", node->name, *dst_id, node->src[1]->name, *src1_id); + // //GGML_ASSERT(*dst_id == *src1_id); + //} + } + } // do not overwrite user assignments if (*node_backend_id == -1) { *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); @@ -1652,6 +1698,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // check if we should start a new split based on the sources of the current node bool need_new_split = false; if ((node->op == GGML_OP_ADD && node->op_params[0] == 0xff) || + node->op == GGML_OP_REDUCE || + node->op == GGML_OP_FAKE_CPY || node->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] == 0xff) { need_new_split = true; } @@ -1739,6 +1787,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) { // create a copy of the input in the split's backend if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) { + if (node->op == GGML_OP_REDUCE) { + //printf("setting tensor_id_copy(reduce, %zu, %d, %s) to %s\n", src_id, cur_backend_id, node->name, src->name); + tensor_id_copy(src_id, cur_backend_id, 0) = src; + } else if (node->op == GGML_OP_FAKE_CPY && src->op == GGML_OP_REDUCE) { + //printf("setting tensor_id_copy(fake_cpy, %zu, %d, %s) to %s\n", src_id, cur_backend_id, node->name, src->src[j]->name); + tensor_id_copy(src_id, cur_backend_id, 0) = src->src[j]; + } else { ggml_backend_t backend = sched->backends[cur_backend_id]; for (int c = 0; c < sched->n_copies; c++) { struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); @@ -1753,6 +1808,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg int n_inputs = split->n_inputs++; GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); split->inputs[n_inputs] = src; + } } node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy); } @@ -2027,80 +2083,8 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back } } -static ggml_status ggml_backend_sched_compute_splits_sm_graph(ggml_backend_sched_t sched) { - std::vector ids; - std::vector unique_ids; - ggml_tensor * last_ids_tensor = nullptr; - - std::array needs_sync{{true}}; - - auto splits = sched->splits; - - std::vector this_split; - for (int i = 0; i < sched->n_splits; ++i) { - auto split_i = &splits[i]; - this_split.clear(); - this_split.push_back(split_i); - for (int j = i+1; j < sched->n_splits; ++j) { - auto split_j = &splits[j]; - if (split_i->backend_id == split_j->backend_id) { - break; - } - int n_nodes = std::min(split_i->graph.n_nodes, split_j->graph.n_nodes); - bool same = true; - for (int k = 0; k < n_nodes; ++k) { - if (split_i->graph.nodes[k]->op != split_j->graph.nodes[k]->op) { - same = false; break; - } - } - if (!same) { - break; - } - this_split.push_back(split_j); - } - if (false) { - auto split = this_split.front(); - if (this_split.size() == 1) { - printf("=== Split %d with %d inputs on backend %d\n", i, split->n_inputs, split->backend_id); - } else { - 
printf("=== Split %d with %d inputs on backends", i, split->n_inputs); - for (int j = 0; j < (int)this_split.size(); ++j) printf(" %d", this_split[j]->backend_id); - printf("\n"); - } - for (int j = 0; j < split->graph.n_nodes; ++j) { - printf(" %d %s(%s)\n", j, ggml_op_name(split->graph.nodes[j]->op), split->graph.nodes[j]->name); - } - } - for (auto split : this_split) { - ggml_backend_sched_copy_inputs(sched, split, needs_sync, ids, unique_ids, last_ids_tensor); - } - for (auto split : this_split) { - auto split_backend_id = split->backend_id; - if (split->n_inputs > 0) { - needs_sync[split_backend_id] = true; - } - auto split_backend = sched->backends[split_backend_id]; - auto ec = ggml_backend_graph_compute_async(split_backend, &split->graph); - if (ec != GGML_STATUS_SUCCESS) { - return ec; - } - if (split->n_inputs > 0) { - if (sched->events[split_backend_id][sched->cur_copy] != NULL) { - ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]); - } - } - } - i += this_split.size() - 1; - } - return GGML_STATUS_SUCCESS; -} - static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { - if (false && sched->split_mode_graph) { - return ggml_backend_sched_compute_splits_sm_graph(sched); - } - std::array needs_sync{{true}}; std::array own_cpy{{false}}; diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index b6dc3502..c2e4e688 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -48,6 +48,7 @@ #include "ggml-cuda/argmax.cuh" #include "ggml-cuda/multiadd.cuh" #include "ggml-cuda/hadamard.cuh" +#include "ggml-cuda/reduce.cuh" #include #include @@ -143,7 +144,7 @@ int ggml_cuda_get_device() { return id; } -static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) { +cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) { ggml_cuda_set_device(device); #if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA) auto res = hipMallocManaged(ptr, size); @@ -246,6 +247,42 @@ static ggml_cuda_device_info ggml_cuda_init() { // configure logging to stdout // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); +#ifdef GGML_USE_NCCL + info.have_nccl = false; + if (info.device_count > 1) { + int gpu_list[GGML_CUDA_MAX_DEVICES]; + for(int i = 0; i < info.device_count; ++i) gpu_list[i] = i; + auto status = ncclCommInitAll(info.nccl_coms, info.device_count, gpu_list); + if (status == ncclSuccess) { + printf("=============================== NCCL main communicator initialized\n"); + info.have_nccl = true; + } else { + printf("=============================== NCCL initialization failed with status %d\n", int(status)); + GGML_ABORT("Fatal error"); + } + auto com = info.nccl_coms + info.device_count; + if (info.device_count == 4) { + int devs[8] = {0,1, 2,3, 0,2, 1,3}; + auto com = info.nccl_coms + info.device_count; + for (int ip = 0; ip < 4; ++ip) { + if (auto status = ncclCommInitAll(com+2*ip, 2, devs+2*ip); status != ncclSuccess) { + printf("=============================== NCCL initialization of pair %d failed with status %d\n", ip, int(status)); + GGML_ABORT("Fatal error"); + } + } + printf("=============================== NCCL pair communicators for %d GPUs initialized\n", info.device_count); + } else if (info.device_count == 3) { + int devs[4] = {0,1, 0,2}; + for (int ip = 0; ip < 2; ++ip) { + if (auto status = ncclCommInitAll(com+2*ip, 2, devs+2*ip); status != ncclSuccess) { + printf("=============================== NCCL initialization of pair %d failed with status %d\n", ip, int(status)); + 
GGML_ABORT("Fatal error"); + } + } + printf("=============================== NCCL pair communicators for %d GPUs initialized\n", info.device_count); + } + } +#endif return info; } @@ -465,6 +502,11 @@ static std::atomic ggml_cuda_lock_counter; ggml_backend_cuda_context::ggml_backend_cuda_context(int device) : device(device), name(GGML_CUDA_NAME + std::to_string(device)) { + auto info = const_cast(&ggml_cuda_info()); + if (info->all_ctx[device]) { + GGML_CUDA_LOG_WARN("%s: a context for device %d already exists?\n", __func__, device); + } + info->all_ctx[device] = this; } ggml_backend_cuda_context::~ggml_backend_cuda_context() { @@ -472,6 +514,9 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() { std::unique_lock lock(ggml_cuda_lock); ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; }); + auto info = const_cast(&ggml_cuda_info()); + info->all_ctx[this->device] = nullptr; + if (copy_event != nullptr) { CUDA_CHECK(cudaEventDestroy(copy_event)); } @@ -2934,6 +2979,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg //printf("%4d %s(%s) on device %d. time = %ld\n", i, ggml_op_name(dst->op), dst->name, ctx.device, ggml_time_us()); switch (dst->op) { + case GGML_OP_REDUCE: + ggml_cuda_op_reduce(ctx, dst); + break; + case GGML_OP_FAKE_CPY: + break; case GGML_OP_ARGMAX: ggml_cuda_argmax(ctx, dst); break; @@ -3451,8 +3501,23 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ needs_f16_f32_copy = true; } else { +#ifdef GGML_USE_NCCL__ + auto & info = ggml_cuda_info(); + auto nbytes = ggml_nbytes(src); + ncclGroupStart(); + ggml_cuda_set_device(cuda_ctx_src->device); + auto status1 = ncclSend(src->data, nbytes, ncclUint8, cuda_ctx_dst->device, info.nccl_coms[cuda_ctx_src->device], + info.all_ctx[cuda_ctx_src->device]->stream()); + ggml_cuda_set_device(cuda_ctx_dst->device); + auto status2 = ncclRecv(dst->data, nbytes, ncclUint8, cuda_ctx_src->device, info.nccl_coms[cuda_ctx_dst->device], + info.all_ctx[cuda_ctx_dst->device]->stream()); + ncclGroupEnd(); + GGML_ASSERT(status1 == ncclSuccess && status2 == ncclSuccess); + return true; +#else ggml_cuda_set_device(cuda_ctx_src->device); CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream())); +#endif } #endif } @@ -3733,12 +3798,12 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx } #endif #ifndef NDEBUG - assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); - for (int j = 0; j < GGML_MAX_SRC; j++) { - if (node->src[j] != nullptr) { - assert(node->src[j]->buffer); - } - } + //assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); + //for (int j = 0; j < GGML_MAX_SRC; j++) { + // if (node->src[j] != nullptr) { + // assert(node->src[j]->buffer); + // } + //} #endif // NDEBUG bool ok = ggml_cuda_compute_forward(*cuda_ctx, node, cgraph, i); @@ -4044,6 +4109,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons } return false; } break; + case GGML_OP_REDUCE: + case GGML_OP_FAKE_CPY: case GGML_OP_ARGMAX: return true; case GGML_OP_HADAMARD: @@ -4372,6 +4439,13 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, [[maybe_unused]] con #endif } +#ifdef GGML_USE_NCCL + if (!enable_p2p) { + printf("================== P2P disabled, but needed for NCCL\n"); + enable_p2p = true; + } +#endif + #if !defined(GGML_CUDA_NO_PEER_COPY) if 
(enable_p2p) { ggml_cuda_set_peer_access(device); diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 90484ad4..27909321 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -34,6 +34,10 @@ #include "vendors/cuda.h" #endif // defined(GGML_USE_HIPBLAS) +#ifdef GGML_USE_NCCL +#include +#endif + #define STRINGIZE_IMPL(...) #__VA_ARGS__ #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__) @@ -738,6 +742,8 @@ struct ggml_cuda_type_traits { ////////////////////// +struct ggml_backend_cuda_context; + struct ggml_cuda_device_info { int device_count; @@ -754,6 +760,12 @@ struct ggml_cuda_device_info { cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {}; std::array default_tensor_split = {}; + + ggml_backend_cuda_context * all_ctx[GGML_CUDA_MAX_DEVICES] = { nullptr }; +#ifdef GGML_USE_NCCL + ncclComm_t nccl_coms[GGML_CUDA_MAX_DEVICES]; + bool have_nccl; +#endif }; const ggml_cuda_device_info & ggml_cuda_info(); @@ -844,6 +856,9 @@ struct ggml_backend_cuda_context { bool use_cuda_graph = true; #endif + void * copy_buffer = nullptr; + size_t copy_size = 0; + explicit ggml_backend_cuda_context(int device); ~ggml_backend_cuda_context(); @@ -889,3 +904,5 @@ struct ggml_backend_cuda_context { return pool(device); } }; + +cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device); diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu new file mode 100644 index 00000000..3ad4fb1f --- /dev/null +++ b/ggml/src/ggml-cuda/reduce.cu @@ -0,0 +1,319 @@ +// +// Copyright (C) 2023-2024 The ggml authors +// Copyright (C) 2024 Iwan Kawrakow +// MIT license +// SPDX-License-Identifier: MIT +// + +#include "reduce.cuh" + +#include + +template +static __global__ void k_add(int nelem, const T * src, T * dst) { + int i = blockIdx.x*block_size + threadIdx.x; + if (i >= nelem) return; + dst[i] += src[i]; +} + +template +static __global__ void k_add_sym(int nelem, T * src, T * dst) { + int i = blockIdx.x*block_size + threadIdx.x; + if (i >= nelem) return; + dst[i] += src[i]; + src[i] = dst[i]; +} + +struct copy_task { + void * ptrs[GGML_CUDA_MAX_DEVICES]; + int nptr; + int nelem; +}; + +template +static __global__ void k_reduce_add(copy_task task) { + int i = blockIdx.x*block_size + threadIdx.x; + if (i >= task.nelem) return; + auto dst = (T *)task.ptrs[0]; + for (int j = 1; j < task.nptr; ++j) { + auto src = (T *)task.ptrs[j]; + dst[i] += src[i]; + } + for (int j = 1; j < task.nptr; ++j) { + auto src = (T *)task.ptrs[j]; + src[i] = dst[i]; + } +} + +void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + + auto op = (ggml_op)dst->op_params[0]; + GGML_ASSERT(op == GGML_OP_ADD); + int nreduce = dst->op_params[1]; + int nhave = dst->op_params[2]; + GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(nhave >=2 && nhave <= nreduce); + + auto & info = ggml_cuda_info(); +#ifdef GGML_USE_NCCL + if (info.have_nccl && nhave == nreduce) { // somehow I'm not able to figure out how to use NCCL when not all GPUs participate in the reduce op + GGML_ASSERT(info.have_nccl); + GGML_ASSERT(info.device_count == nreduce); + auto type = dst->type; + //int device = ctx.device; + if (nreduce != info.device_count) { + GGML_ABORT("Not implemented"); + } + //auto tim1 = std::chrono::steady_clock::now(); + auto data_type = type == GGML_TYPE_F32 ? 
ncclFloat : ncclHalf; + if (nreduce == 4 && dst->ne[1] > 32) { + auto com = info.nccl_coms + info.device_count; + static const int devs[8] = {0,1, 2,3, 0,2, 1,3}; + for (int ip = 0; ip < 4; ++ip) { + ncclGroupStart(); + ggml_cuda_set_device(devs[2*ip+0]); + auto status1 = ncclAllReduce(dst->src[devs[2*ip+0]]->data, dst->src[devs[2*ip+0]]->data, + ggml_nelements(dst), data_type, ncclSum, com[2*ip+0], info.all_ctx[devs[2*ip+0]]->stream()); + ggml_cuda_set_device(devs[2*ip+1]); + auto status2 = ncclAllReduce(dst->src[devs[2*ip+1]]->data, dst->src[devs[2*ip+1]]->data, + ggml_nelements(dst), data_type, ncclSum, com[2*ip+1], info.all_ctx[devs[2*ip+1]]->stream()); + ncclGroupEnd(); + if (status1 != ncclSuccess || status2 != ncclSuccess) { + fprintf(stderr, "%s: ncclAllReduce failed with statuses %d, %d\n", __func__, (int)status1, (int)status2); + GGML_ABORT("Fatal error"); + } + } + } + else if (nreduce == 3 && dst->ne[1] > 32) { + auto com = info.nccl_coms + info.device_count; + static const int devs[4] = {0,1, 0,2}; + for (int ip = 0; ip < 2; ++ip) { + ncclGroupStart(); + ggml_cuda_set_device(devs[2*ip+0]); + auto status1 = ncclAllReduce(dst->src[devs[2*ip+0]]->data, dst->src[devs[2*ip+0]]->data, + ggml_nelements(dst), data_type, ncclSum, com[2*ip+0], info.all_ctx[devs[2*ip+0]]->stream()); + ggml_cuda_set_device(devs[2*ip+1]); + auto status2 = ncclAllReduce(dst->src[devs[2*ip+1]]->data, dst->src[devs[2*ip+1]]->data, + ggml_nelements(dst), data_type, ncclSum, com[2*ip+1], info.all_ctx[devs[2*ip+1]]->stream()); + ncclGroupEnd(); + if (status1 != ncclSuccess || status2 != ncclSuccess) { + fprintf(stderr, "%s: ncclAllReduce failed with statuses %d, %d\n", __func__, (int)status1, (int)status2); + GGML_ABORT("Fatal error"); + } + } + ncclGroupStart(); + ggml_cuda_set_device(0); + auto status1 = ncclSend(dst->src[0]->data, ggml_nelements(dst), data_type, 1, com[0], info.all_ctx[0]->stream()); + ggml_cuda_set_device(1); + auto status2 = ncclRecv(dst->src[1]->data, ggml_nelements(dst), data_type, 0, com[1], info.all_ctx[1]->stream()); + ncclGroupEnd(); + if (status1 != ncclSuccess || status2 != ncclSuccess) { + fprintf(stderr, "%s: ncclSend/Recv failed with statuses %d, %d\n", __func__, (int)status1, (int)status2); + GGML_ABORT("Fatal error"); + } + } + else { + ncclGroupStart(); + for (int i = 0; i < nreduce; ++i) { + ncclComm_t this_comm; + if (nhave == nreduce) { + this_comm = info.nccl_coms[i]; + } else { + auto status = ncclCommSplit(info.nccl_coms[i], dst->src[i] ? 0 : NCCL_SPLIT_NOCOLOR, i, &this_comm, NULL); + GGML_ASSERT(status == ncclSuccess); + } + ggml_cuda_set_device(i); + auto stream = info.all_ctx[i]->stream(); + GGML_ASSERT(stream); + auto status = ncclAllReduce(dst->src[i] ? dst->src[i]->data : nullptr, + dst->src[i] ? dst->src[i]->data : nullptr, + ggml_nelements(dst), data_type, ncclSum, this_comm, stream); + if (status != ncclSuccess) { + fprintf(stderr, "%s: ncclAllReduce failed with status %d\n", __func__, (int)status); + GGML_ABORT("Fatal error"); + } + } + ncclGroupEnd(); + } + ggml_cuda_set_device(ctx.device); + return; + } +#endif + GGML_ASSERT(dst->data == dst->src[ctx.device]->data); + auto nbytes = ggml_nbytes(dst); + if (nhave == 2 && (nhave == nreduce || dst->ne[1] <= 8)) { + int idx[2]; + int ii = 0; + for (int i = 0; i < nreduce; ++i) { + if (dst->src[i]) { + idx[ii++] = i; + } + } + // With P2P access enabled, we can access peer memory so as if it was local. + // Hence, we can launch two reduce kernels, one on each device, each kernel + // processing half of the data. 
This very simply approach almost matches NCCL + // performance (I see ~1% lower PP and TG performance on my 2x3090 system). + for (int i = 0; i < nhave; ++i) { + GGML_ASSERT(dst->src[idx[i]]->type == dst->type); + GGML_ASSERT(ggml_are_same_shape(dst, dst->src[idx[i]])); + ggml_cuda_set_device(idx[i]); + if (!info.all_ctx[idx[i]]->copy_event) { + CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[idx[i]]->copy_event, cudaEventDisableTiming)); + } + CUDA_CHECK(cudaEventRecord(info.all_ctx[idx[i]]->copy_event, info.all_ctx[idx[i]]->stream())); + } + auto nelem = ggml_nelements(dst); + auto nelem_half = (nelem + 1)/2; + for (int i = 0; i < nhave; ++i) { + ggml_cuda_set_device(idx[i]); + CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[idx[i]]->stream(), info.all_ctx[idx[(i+1)%2]]->copy_event, 0)); + auto this_nelem = std::min(nelem_half, nelem - nelem_half); + int nblock = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE; + if (dst->type == GGML_TYPE_F16) { + auto src_ptr = (half *)dst->src[idx[i]]->data + i*nelem_half; + auto dst_ptr = (half *)dst->src[idx[(i+1)%2]]->data + i*nelem_half; + k_add_sym<<stream()>>>(this_nelem, src_ptr, dst_ptr); + } else { + auto src_ptr = (float *)dst->src[idx[i]]->data + i*nelem_half; + auto dst_ptr = (float *)dst->src[idx[(i+1)%2]]->data + i*nelem_half; + k_add_sym<<stream()>>>(this_nelem, src_ptr, dst_ptr); + } + } + for (int i = 0; i < nhave; ++i) { + ggml_cuda_set_device(idx[i]); + CUDA_CHECK(cudaEventRecord(info.all_ctx[idx[i]]->copy_event, info.all_ctx[idx[i]]->stream())); + ggml_cuda_set_device(idx[(i+1)%2]); + CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[idx[(i+1)%2]]->stream(), info.all_ctx[idx[i]]->copy_event)); + } + ggml_cuda_set_device(ctx.device); + return; + } + int idx[GGML_CUDA_MAX_DEVICES]; + { + int ii = 0; + bool have_this_device = false; + for (int i = 0; i < nreduce; ++i) { + if (dst->src[i]) { + idx[ii++] = i; + if (i == ctx.device) have_this_device = true; + } + } + GGML_ASSERT(ii == nhave); + GGML_ASSERT(have_this_device); + } + if (nhave == 4 && dst->ne[1] <= 8) { + for (int ii = 0; ii < nhave; ++ii) { + int i = idx[ii]; + GGML_ASSERT(dst->src[i]->type == dst->type); + GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i])); + ggml_cuda_set_device(i); + if (!info.all_ctx[i]->copy_event) { + CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming)); + } + } + auto nelem = ggml_nelements(dst); + for (int ii = 0; ii < nhave/2; ++ii) { + int i = idx[2*ii+0]; + ggml_cuda_set_device(i); + int nblocks = (nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE; + copy_task task; + task.nptr = nhave/2; + task.nelem = nelem; + task.ptrs[0] = (char *)dst->src[i]->data; + int j = idx[2*ii+1]; + CUDA_CHECK(cudaEventRecord(info.all_ctx[j]->copy_event, info.all_ctx[j]->stream())); + task.ptrs[1] = (char *)dst->src[j]->data; + CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event)); + if (dst->type == GGML_TYPE_F16) { + k_reduce_add<<stream()>>>(task); + } else { + k_reduce_add<<stream()>>>(task); + } + } + for (int ii = 0; ii < nhave/2; ++ii) { + int i = idx[2*ii+0]; + ggml_cuda_set_device(i); + CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream())); + } + for (int ii = 0; ii < nhave/2; ++ii) { + int i = idx[2*ii+1]; + ggml_cuda_set_device(i); + int nblocks = (nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE; + copy_task task; + task.nptr = nhave/2; + task.nelem = nelem; + task.ptrs[0] = (char *)dst->src[i]->data; + int j = 
idx[(2*ii+2)%nhave]; + task.ptrs[1] = (char *)dst->src[j]->data; + CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event)); + if (dst->type == GGML_TYPE_F16) { + k_reduce_add<<stream()>>>(task); + } else { + k_reduce_add<<stream()>>>(task); + } + } + for (int ii = 0; ii < nhave/2; ++ii) { + int i = idx[2*ii+1]; + ggml_cuda_set_device(i); + CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream())); + } + for (int ii = 0; ii < nhave/2; ++ii) { + int i = idx[(2*ii+2)%nhave]; + ggml_cuda_set_device(i); + int j = idx[2*ii+1]; + CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event)); + } + ggml_cuda_set_device(ctx.device); + return; + } + auto required_size = nbytes*(nhave-1); + if (required_size > ctx.copy_size) { + if (ctx.copy_buffer) { + CUDA_CHECK(cudaFree(ctx.copy_buffer)); + } + CUDA_CHECK(ggml_cuda_device_malloc(&ctx.copy_buffer, required_size, ctx.device)); + ctx.copy_size = required_size; + } + auto ptr = (char *)ctx.copy_buffer; + for (int ii = 0; ii < nhave; ++ii) { + int i = idx[ii]; + GGML_ASSERT(dst->src[i]->type == dst->type); + GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i])); + if (i == ctx.device) continue; + ggml_cuda_set_device(i); + CUDA_CHECK(cudaMemcpyPeerAsync(ptr, ctx.device, dst->src[i]->data, i, nbytes, info.all_ctx[i]->stream())); + if (!info.all_ctx[i]->copy_event) { + CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming)); + } + CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream())); + ptr += nbytes; + } + auto nelem = ggml_nelements(dst); + int num_blocks = (nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE; + ggml_cuda_set_device(ctx.device); + ptr = (char *)ctx.copy_buffer; + for (int ii = 0; ii < nhave; ++ii) { + int i = idx[ii]; + if (i == ctx.device) continue; + CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0)); + if (dst->type == GGML_TYPE_F16) { + k_add<<>>(nelem, (const half *)ptr, (half *)dst->data); + } else { + k_add<<>>(nelem, (const float *)ptr, (float *)dst->data); + } + ptr += nbytes; + } + if (!ctx.copy_event) { + CUDA_CHECK(cudaEventCreateWithFlags(&ctx.copy_event, cudaEventDisableTiming)); + } + CUDA_CHECK(cudaEventRecord(ctx.copy_event, ctx.stream())); + for (int ii = 0; ii < nhave; ++ii) { + int i = idx[ii]; + if (i == ctx.device) continue; + ggml_cuda_set_device(i); + CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), ctx.copy_event, 0)); + CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->data, ctx.device, nbytes, info.all_ctx[i]->stream())); + } + ggml_cuda_set_device(ctx.device); +} diff --git a/ggml/src/ggml-cuda/reduce.cuh b/ggml/src/ggml-cuda/reduce.cuh new file mode 100644 index 00000000..2338c2c6 --- /dev/null +++ b/ggml/src/ggml-cuda/reduce.cuh @@ -0,0 +1,7 @@ +#include "common.cuh" + +#define CUDA_REDUCE_BLOCK_SIZE 256 + +void ggml_cuda_op_reduce(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_fake_cpy(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 938d19de..f6a0bdee 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4291,9 +4291,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", "GLU", + + "REDUCE", + "FAKE_CPY", }; -static_assert(GGML_OP_COUNT == 92, "GGML_OP_COUNT != 92"); +static_assert(GGML_OP_COUNT == 94, "GGML_OP_COUNT != 94"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -4398,10 
+4401,13 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss(x,y)", "cross_entropy_loss_back(x,y)", - "glu(x)," + "glu(x),", + + "reduce(x1,x2,...)", + "fake_cpy(x,y)", }; -static_assert(GGML_OP_COUNT == 92, "GGML_OP_COUNT != 92"); +static_assert(GGML_OP_COUNT == 94, "GGML_OP_COUNT != 94"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -6060,6 +6066,43 @@ struct ggml_tensor * ggml_dup_inplace( return ggml_dup_impl(ctx, a, true); } +struct ggml_tensor * ggml_reduce( + struct ggml_context * ctx, + struct ggml_tensor ** a, + int n, + enum ggml_op op) { + GGML_ASSERT(n > 1 && n <= GGML_MAX_SRC); + GGML_ASSERT(op == GGML_OP_ADD); // currently we only handle reduce_add + struct ggml_tensor * last = NULL; + int nhave = 0; + for (int j = 0; j < n; ++j) { + if (a[j]) { ++nhave; last = a[j]; } + } + GGML_ASSERT(last); + GGML_ASSERT(nhave > 1); + struct ggml_tensor * result = ggml_view_tensor(ctx, last); + for (int j = 0; j < n; ++j) { + result->src[j] = a[j]; + } + result->op = GGML_OP_REDUCE; + result->op_params[0] = (int)op; + result->op_params[1] = n; + result->op_params[2] = nhave; + return result; +} + +struct ggml_tensor * ggml_fake_cpy( + struct ggml_context * ctx, + struct ggml_tensor * dst, + struct ggml_tensor * src) { + struct ggml_tensor * result = ggml_view_tensor(ctx, dst); + result->op = GGML_OP_FAKE_CPY; + result->src[0] = dst; + result->src[1] = src; + return result; +} + + // ggml_add static struct ggml_tensor * ggml_add_impl( @@ -8433,6 +8476,21 @@ struct ggml_tensor * ggml_get_rows( if (a->type == GGML_TYPE_I32) { type = a->type; } + + //if (a->op == GGML_OP_REDUCE) { + // //printf("======================= %s(%s)\n", __func__, a->name); + // struct ggml_tensor * result = NULL; + // for (int j = a->op_params[1]-1; j >= 0; --j) { + // if (a->src[j]) { + // struct ggml_tensor * aj = ggml_get_rows(ctx, a->src[j], b); + // if (result == NULL) result = ggml_view_tensor(ctx, aj); + // result->src[j] = aj; + // } + // } + // GGML_ASSERT(result); + // return result; + //} + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]); result->op = GGML_OP_GET_ROWS; @@ -22675,6 +22733,14 @@ static int ggml_compute_forward(struct ggml_compute_params * params, struct ggml #endif switch (tensor->op) { + case GGML_OP_REDUCE: + { + GGML_ABORT("REDUCE not implemented"); + } + case GGML_OP_FAKE_CPY: + { + GGML_ABORT("FAKE_CPY not implemented"); + } case GGML_OP_DUP: { ggml_compute_forward_dup(params, tensor); @@ -23352,6 +23418,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor struct ggml_tensor * src2 = tensor->src[2]; switch (tensor->op) { + case GGML_OP_REDUCE: + { + GGML_ABORT("REDUCE not implemented"); + } + case GGML_OP_FAKE_CPY: + { + GGML_ABORT("FAKE_CPY not implemented"); + } case GGML_OP_DUP: { if (src0->grad) { diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 0c0ef048..f3fc0caf 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -620,6 +620,20 @@ ggml_tensor * llm_build_context::llm_build_norm( return cur; } +static ggml_tensor * get_input_tensor_sm_graph(ggml_tensor * input, int id) { + auto cur = input; + if (input->op == GGML_OP_REDUCE) { + auto view_src = input->view_src; + GGML_ASSERT(view_src); + cur = input->src[id]; + if (cur == view_src || !cur) { + //printf("%s: Setting input to %s for id = %d\n", __func__, view_src->name, id); + cur = input; + } + } + return cur; +} + ggml_tensor * 
llm_build_context::llm_build_ffn( ggml_context * ctx, llama_context & lctx, @@ -637,19 +651,21 @@ ggml_tensor * llm_build_context::llm_build_ffn( ggml_tensor * act_scales, llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, - const llm_build_cb & cb, int il, ggml_cgraph * graph) { + const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input, + bool is_norm, ggml_tensor * add_extra) { if (!up_b && !up_s && !gate_b && !gate_s && !down_b && !down_s && up->extra && gate->extra && down->extra && type_gate == LLM_FFN_PAR && (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) { + //printf("%s: %s\n", __func__, ggml_op_name(input->op)); auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU; auto u = (ggml_split_tensor_t *)up->extra; auto g = (ggml_split_tensor_t *)gate->extra; auto d = (ggml_split_tensor_t *)down->extra; GGML_ASSERT(u->n_device == g->n_device && u->n_device == d->n_device); - std::vector ffn; - ffn.reserve(u->n_device); + std::vector ffn(u->n_device, nullptr); + int id_last = -1; for (int id = 0; id < u->n_device; ++id) { int il_cb = 1000*(id+1) + il; auto split_u = u->splits[id]; @@ -657,15 +673,21 @@ ggml_tensor * llm_build_context::llm_build_ffn( auto split_d = d->splits[id]; GGML_ASSERT((!split_u && !split_g && !split_d) || (split_u && split_g && split_d)); if (!split_u) continue; - auto cur = input; + auto cur = get_input_tensor_sm_graph(input, id); if (ffn_norm && ffn_norm->extra) { auto norm = (ggml_split_tensor_t *)ffn_norm->extra; GGML_ASSERT(norm->splits[id]); - cur = llm_build_norm(ctx, input, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il); + if (is_norm) { + cur = llm_build_norm(ctx, cur, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM, cb, il); + GGML_ASSERT(cur->src[0]->op == GGML_OP_NORM); + cur->src[0]->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] = 0xff; + } else { + cur = llm_build_norm(ctx, cur, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il); + } cb(cur, "ffn_inp_normed", il_cb); } - else if (input->type != GGML_TYPE_F32) { - cur = ggml_cast(ctx, input, GGML_TYPE_F32); + else if (cur->type != GGML_TYPE_F32) { + cur = ggml_cast(ctx, cur, GGML_TYPE_F32); } cur = ggml_fused_up_gate(ctx, split_u, split_g, cur, unary_op); cb(cur, "ffn_up_gate", il_cb); @@ -681,32 +703,31 @@ ggml_tensor * llm_build_context::llm_build_ffn( if (graph) { ggml_build_forward_expand(graph, cur); } - ffn.push_back(cur); + ffn[id] = cur; + id_last = id; } - if (ffn.size() == 1) return ffn.front(); - auto cur = ggml_add(ctx, ffn[0], ffn[1]); - cb(cur, "combine_ffn", il); - cur->op_params[0] = 0xff; - for (int id = 2; id < int(ffn.size()); ++id) { - cur = ggml_add(ctx, cur, ffn[id]); - cb(cur, "combine_ffn", il); + GGML_ASSERT(id_last >= 0); + if (add_input) { + ffn[id_last] = ggml_add(ctx, ffn[id_last], input); + cb(ffn[id_last], "ffn_with_inp", il); } - if (ffn.size() > 2) { - cur->op_params[0] = 0xff; + if (add_extra) { + ffn[id_last] = ggml_add(ctx, ffn[id_last], add_extra); + cb(ffn[id_last], "ffn_with_inp", il); } - //if (cur->type != GGML_TYPE_F32) { - // cur = ggml_cast(ctx, cur, GGML_TYPE_F32); - //} - + auto cur = ggml_reduce(ctx, ffn.data(), u->n_device, GGML_OP_ADD); + cb(cur, "ffn_combined", il); + ggml_build_forward_expand(graph, cur); return cur; } + auto cur = input; if (ffn_norm) { - input = llm_build_norm(ctx, input, lctx.model.hparams, ffn_norm, NULL, LLM_NORM_RMS, cb, il); + cur = 
llm_build_norm(ctx, cur, lctx.model.hparams, ffn_norm, NULL, is_norm ? LLM_NORM : LLM_NORM_RMS, cb, il); cb(input, "ffn_norm", il); } - else if (input->type != GGML_TYPE_F32) { - input = ggml_cast(ctx, input, GGML_TYPE_F32); + if (cur->type != GGML_TYPE_F32) { + cur = ggml_cast(ctx, cur, GGML_TYPE_F32); } if (lctx.cparams.fused_up_gate && @@ -714,7 +735,7 @@ ggml_tensor * llm_build_context::llm_build_ffn( (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) { auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU; - auto cur = ggml_fused_up_gate(ctx, up, gate, input, unary_op); + cur = ggml_fused_up_gate(ctx, up, gate, cur, unary_op); cb(cur, "ffn_up_gate", il); if (down) { cur = llm_build_lora_mm(lctx, ctx, down, cur); @@ -733,10 +754,18 @@ ggml_tensor * llm_build_context::llm_build_ffn( cur = ggml_mul(ctx, cur, down_s); cb(cur, "ffn_down_s", il); } + if (add_input) { + cur = ggml_add(ctx, cur, input); + cb(cur, "ffn_out_with_inp", il); + } + if (add_extra) { + cur = ggml_add(ctx, cur, add_extra); + cb(cur, "ffn_out_with_inp", il); + } return cur; } - struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, input) : input; + struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur; cb(tmp, "ffn_up", il); if (up_b) { @@ -749,7 +778,6 @@ ggml_tensor * llm_build_context::llm_build_ffn( cb(tmp, "ffn_up_s", il); } - auto cur = input; if (gate) { switch (type_gate) { case LLM_FFN_SEQ: @@ -849,6 +877,15 @@ ggml_tensor * llm_build_context::llm_build_ffn( cb(cur, "ffn_down_s", il); } + if (add_input) { + cur = ggml_add(ctx, cur, input); + cb(cur, "ffn_out_with_inp", il); + } + if (add_extra) { + cur = ggml_add(ctx, cur, add_extra); + cb(cur, "ffn_out_with_inp", il); + } + return cur; } @@ -868,7 +905,9 @@ ggml_tensor * llm_build_context::llm_build_moe_ffn( bool scale_w, float w_scale, llm_expert_gating_func_type gating_op, - const llm_build_cb & cb, int il, ggml_cgraph * graph) { + const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input) { + + auto input = cur; int64_t n_embd = cur->ne[0]; int64_t n_tokens = cur->ne[1]; @@ -1040,20 +1079,30 @@ llm_expert_gating_func_type gating_op, if (lctx.cparams.fused_mmad) { experts = ggml_mul_multi_add(ctx, experts, weights); cb(experts, "ffn_moe_weighted", il); + if (add_input) { + experts = ggml_add(ctx, experts, input); + cb(experts, "ffn_out_with_inp", il); + } return experts; } experts = ggml_mul(ctx, experts, weights); cb(experts, "ffn_moe_weighted", il); } + ggml_tensor * result; if (n_expert_used == 1) { - return ggml_cont(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0)); + result = ggml_cont(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0)); } if (n_expert_used == 2) { - return ggml_add(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0), + result = ggml_add(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0), ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], experts->nb[1])); } - return ggml_multi_add(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0), n_expert_used); + result = ggml_multi_add(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0), n_expert_used); + if (add_input) { + cb(result, "ffn_out", il); + result = ggml_add(ctx, result, input); + } + return result; } @@ -1076,7 +1125,7 @@ ggml_tensor * llm_build_context::llm_build_std_moe_ffn(ggml_context * ctx, llama 
float w_scale, llm_expert_gating_func_type gating_op, llm_ffn_op_type type_op_shexp, - const llm_build_cb & cb, int il, ggml_cgraph * graph) { + const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input) { auto split_up_exps = (ggml_split_tensor_t *)up_exps->extra; auto split_gate_exps = (ggml_split_tensor_t *)gate_exps->extra; @@ -1092,10 +1141,10 @@ llm_expert_gating_func_type gating_op, if (ffn_norm) { auto the_ffn_norm = ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[lctx.model.main_gpu] : ffn_norm; GGML_ASSERT(the_ffn_norm); - cur = llm_build_norm(ctx, input, lctx.model.hparams, the_ffn_norm, nullptr, LLM_NORM_RMS, cb, il); + cur = llm_build_norm(ctx, cur, lctx.model.hparams, the_ffn_norm, nullptr, LLM_NORM_RMS, cb, il); cb(cur, "ffn_inp_normed", il); } - else if (cur->type != GGML_TYPE_F32) { + if (cur->type != GGML_TYPE_F32) { cur = ggml_cast(ctx, cur, GGML_TYPE_F32); } auto the_gate_inp = gate_inp->extra ? ((ggml_split_tensor_t *)gate_inp->extra)->splits[lctx.model.main_gpu] : gate_inp; @@ -1110,8 +1159,12 @@ llm_expert_gating_func_type gating_op, the_exp_probs_b, n_expert, n_expert_used, type_op, norm_w, scale_w, w_scale, - gating_op, cb, il, graph); + gating_op, cb, il, graph, false); cb(routed_out, "routed_out", il); + if (add_input) { + routed_out = ggml_add(ctx, routed_out, input); + cb(routed_out, "routed_out_with_inp", il); + } ggml_build_forward_expand(graph, routed_out); if (up_shexp && gate_shexp && down_shexp) { @@ -1176,26 +1229,27 @@ llm_expert_gating_func_type gating_op, } GGML_ASSERT(split_up_exps && split_gate_exps && split_down_exps); GGML_ASSERT(split_up_exps->n_device == split_gate_exps->n_device && split_up_exps->n_device == split_down_exps->n_device); - std::vector results; results.reserve(split_up_exps->n_device); + std::vector results(split_up_exps->n_device, nullptr); GGML_ASSERT((!split_up_shexp && !split_gate_shexp && !split_down_shexp) || ( split_up_shexp && split_gate_shexp && split_down_shexp)); auto split_gate_inp = (ggml_split_tensor_t *)gate_inp->extra; GGML_ASSERT(split_gate_inp && split_gate_inp->n_device == split_up_exps->n_device); auto split_exp_probs_b = exp_probs_b ? (ggml_split_tensor_t *)exp_probs_b->extra : nullptr; GGML_ASSERT(!split_exp_probs_b || split_exp_probs_b->n_device == split_up_exps->n_device); + int last_id = -1; for (int id = 0; id < split_up_exps->n_device; ++id) { GGML_ASSERT((split_up_exps->splits[id] && split_gate_exps->splits[id] && split_down_exps->splits[id]) || (!split_up_exps->splits[id] && !split_gate_exps->splits[id] && !split_down_exps->splits[id])); if (!split_up_exps->splits[id]) continue; int il_cb = 1000*(id + 1) + il; - auto cur = input; + auto cur = get_input_tensor_sm_graph(input, id); if (ffn_norm) { auto split_ffn_norm = (ggml_split_tensor_t *)ffn_norm->extra; GGML_ASSERT(split_ffn_norm && split_ffn_norm->n_device == split_up_exps->n_device); - cur = llm_build_norm(ctx, input, lctx.model.hparams, split_ffn_norm->splits[id], nullptr, LLM_NORM_RMS, cb, il); + cur = llm_build_norm(ctx, cur, lctx.model.hparams, split_ffn_norm->splits[id], nullptr, LLM_NORM_RMS, cb, il); cb(cur, "ffn_inp_normed", il_cb); } - else if (cur->type != GGML_TYPE_F32) { + if (cur->type != GGML_TYPE_F32) { cur = ggml_cast(ctx, cur, GGML_TYPE_F32); } auto routed_out = llm_build_moe_ffn(ctx, lctx, cur, @@ -1206,7 +1260,7 @@ llm_expert_gating_func_type gating_op, split_exp_probs_b ? 
split_exp_probs_b->splits[id] : nullptr, n_expert, n_expert_used, type_op, norm_w, scale_w, w_scale, - gating_op, cb, il, graph); + gating_op, cb, il, graph, false); cb(routed_out, "routed_out", il_cb); if (split_up_shexp) { @@ -1229,19 +1283,20 @@ llm_expert_gating_func_type gating_op, cur = ggml_cast(ctx, cur, GGML_TYPE_F16); cb(cur, "ffn_out_f16", il_cb); } - ggml_build_forward_expand(graph, routed_out); - results.push_back(cur); + ggml_build_forward_expand(graph, cur); + results[id] = cur; + last_id = id; + } + GGML_ASSERT(last_id >= 0); + if (add_input) { + results[last_id] = ggml_add(ctx, results[last_id], input); + cb(results[last_id], "ffn_inp_added", il); } - GGML_ASSERT(!results.empty()); - if (results.size() == 1) return results.front(); - auto cur = ggml_add(ctx, results[0], results[1]); - cur->op_params[0] = 0xff; - cb(cur, "ffn_combined", il); - for (int id = 2; id < int(results.size()); ++id) { - cur = ggml_add(ctx, cur, results[id]); - cb(cur, "ffn_combined", il); - } + auto cur = ggml_reduce(ctx, results.data(), split_up_exps->n_device, GGML_OP_ADD); + cb(cur, "moe_ffn_combined", il); + ggml_build_forward_expand(graph, cur); + return cur; } @@ -1754,7 +1809,7 @@ ggml_cgraph * llm_build_context::build_llama() { // self-attention if (use_rope) { cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, - this_KQ_mask, nullptr, nullptr, kq_scale, hparams.f_attention_scale, this_n_swa, il); + this_KQ_mask, nullptr, nullptr, kq_scale, hparams.f_attention_scale, this_n_swa, il, true, false, true); } else { @@ -1801,15 +1856,18 @@ ggml_cgraph * llm_build_context::build_llama() { Kcur, Vcur, Qcur, this_KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il, nullptr, this_n_swa); } + //printf("%s: attn result for layer %d is %s, %s\n", __func__, il, cur->name, ggml_op_name(cur->op)); if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); cb(cur, "last_attn", il); - cb(inpSA, "last_ffn_inp", il); + if (!use_rope) { + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + cb(inpSA, "last_ffn_inp", il); + } } // For Granite architecture @@ -1818,8 +1876,13 @@ ggml_cgraph * llm_build_context::build_llama() { cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); + ggml_tensor * ffn_inp; + if (use_rope) { + ffn_inp = cur; + } else { + ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + } // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { @@ -1829,7 +1892,7 @@ ggml_cgraph * llm_build_context::build_llama() { model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf); + LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf, true); cb(cur, "ffn_out", il); } else if (model.arch == LLM_ARCH_LLAMA4) { // llama4 MoE @@ -1846,7 +1909,7 @@ ggml_cgraph * llm_build_context::build_llama() { LLM_FFN_SILU, false, false, 0.0, LLM_EXPERT_GATING_FUNC_SIGMOID, - cb, il, gf); + cb, il, gf, true); // Shared experts ggml_tensor * shexp_out = llm_build_ffn(ctx0, lctx, nullptr, ffn_inp_normed, @@ -1875,9 +1938,10 @@ ggml_cgraph * llm_build_context::build_llama() { LLM_FFN_SILU, true, false, 0.0, LLM_EXPERT_GATING_FUNC_SOFTMAX, - cb, il, gf); + cb, il, gf, true); cb(cur, "ffn_moe_out", 
il); } + //printf("%s: ffn result for layer %d is %s, %s\n", __func__, il, cur->name, ggml_op_name(cur->op)); // For Granite architecture if (hparams.f_residual_scale) { @@ -1885,8 +1949,8 @@ ggml_cgraph * llm_build_context::build_llama() { cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); + //cur = ggml_add(ctx0, cur, ffn_inp); + //cb(cur, "ffn_out", il); cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); @@ -3933,23 +3997,26 @@ ggml_cgraph * llm_build_context::build_qwen3moe() { struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + //struct ggml_tensor * inpSA = inpL; // norm //cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); //cb(cur, "attn_norm", il); - cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, KQ_mask, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il); + cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, KQ_mask, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, + il, true, false, true); + //printf("%s: attn = %s(%s)\n", __func__, cur->name, ggml_op_name(cur->op)); if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + //inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); + auto ffn_inp = cur; + //struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + //cb(ffn_inp, "ffn_inp", il); cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, model.layers[il].ffn_gate_inp, nullptr, @@ -3963,9 +4030,11 @@ ggml_cgraph * llm_build_context::build_qwen3moe() { n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0f, LLM_EXPERT_GATING_FUNC_SOFTMAX, - LLM_FFN_SILU, cb, il, gf); + LLM_FFN_SILU, cb, il, gf, true); - cur = ggml_add(ctx0, cur, ffn_inp); + //printf("%s: ffn = %s(%s)\n", __func__, cur->name, ggml_op_name(cur->op)); + + //cur = ggml_add(ctx0, cur, ffn_inp); cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); @@ -6818,7 +6887,7 @@ ggml_cgraph * llm_build_context::build_glm4_moe() { // self-attention if (rope_cache == nullptr) { - cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, KQ_mask, nullptr, nullptr, kq_scale, 0.0f, 0, il); + cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, KQ_mask, nullptr, nullptr, kq_scale, 0.0f, 0, il, true, false, true); } else { // Pre-attention norm cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); @@ -6862,8 +6931,13 @@ ggml_cgraph * llm_build_context::build_glm4_moe() { } // residual connection for attention output - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); + ggml_tensor * ffn_inp; + if (rope_cache) { + ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + } else { + ffn_inp = cur; + } if ((uint32_t) il < hparams.n_layer_dense_lead) { // dense FFN @@ -6872,7 +6946,7 @@ ggml_cgraph * llm_build_context::build_glm4_moe() { model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf); + LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf, true); cb(cur, 
"ffn_out", il); } else { cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, @@ -6887,39 +6961,11 @@ ggml_cgraph * llm_build_context::build_glm4_moe() { n_expert, n_expert_used, LLM_FFN_SILU, hparams.expert_weights_norm, true, hparams.expert_weights_scale, (llm_expert_gating_func_type) hparams.expert_gating_func, - LLM_FFN_SILU, cb, il, gf); - - //// Post-attention norm - //cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - //cb(cur, "post_attn_norm", il); - //// MoE FFN - //auto routed_out = llm_build_moe_ffn(ctx0, lctx, cur, - // model.layers[il].ffn_gate_inp, - // model.layers[il].ffn_up_exps, - // model.layers[il].ffn_gate_exps, - // model.layers[il].ffn_down_exps, - // model.layers[il].ffn_exp_probs_b, - // n_expert, n_expert_used, - // LLM_FFN_SILU, hparams.expert_weights_norm, - // true, hparams.expert_weights_scale, - // (enum llm_expert_gating_func_type) hparams.expert_gating_func, - // cb, il, gf); - //cb(routed_out, "routed_out", il); - - //auto shared_out = llm_build_ffn(ctx0, lctx, nullptr, cur, - // model.layers[il].ffn_up_shexp, NULL, NULL, - // model.layers[il].ffn_gate_shexp, NULL, NULL, - // model.layers[il].ffn_down_shexp, NULL, NULL, - // NULL, - // LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - //cb(shared_out, "ffn_shexp_out", il); - - //cur = ggml_add(ctx0, routed_out, shared_out); - //cb(cur, "ffn_out", il); + LLM_FFN_SILU, cb, il, gf, true); } // residual and context vector - cur = ggml_add(ctx0, cur, ffn_inp); + //cur = ggml_add(ctx0, cur, ffn_inp); cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); @@ -7229,48 +7275,25 @@ ggml_cgraph * llm_build_context::build_cohere2() { const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask; - // norm - auto attn_norm = model.layers[il].attn_norm; - int id = -1; - if (attn_norm->extra) { - auto extra = (ggml_split_tensor_t *)attn_norm->extra; - for (int i = extra->n_device-1; i >= 0; --i) { - if (extra->splits[i]) { - attn_norm = extra->splits[i]; - id = i; - break; - } - } - } - cur = llm_build_norm(ctx0, inpL, hparams, attn_norm, NULL, LLM_NORM, cb, il); - if (id >= 0) { - ggml_backend_sched_set_tensor_backend(lctx.sched, cur->src[0], ggml_backend_sched_get_backend(lctx.sched, id)); - } - cb(cur, "attn_norm", il); - auto ffn_inp = cur; - // self-attention - auto attn_out = build_std_attention(gf, nullptr, cur, inp_pos, nullptr, KQ_mask_l, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), 0.f, - is_sliding ? hparams.n_swa : 0, il, is_sliding, true); + auto attn_out = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, KQ_mask_l, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), 0.f, + is_sliding ? 
hparams.n_swa : 0, il, is_sliding, false, true, true); cb(attn_out, "attn_out", il); if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); attn_out = ggml_get_rows(ctx0, attn_out, inp_out_ids); - ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } // feed-forward network - cur = llm_build_ffn(ctx0, lctx, nullptr, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, + cur = llm_build_ffn(ctx0, lctx, model.layers[il].attn_norm, inpL, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, - cb, il, gf); + cb, il, gf, false, true, attn_out); cb(cur, "ffn_out", il); // add together residual + FFN + self-attention - cur = ggml_add(ctx0, cur, attn_out); - cur = ggml_add(ctx0, cur, inpL); cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); @@ -7279,9 +7302,6 @@ ggml_cgraph * llm_build_context::build_cohere2() { } cur = inpL; - //if (cur->type != GGML_TYPE_F32) { - // cur = ggml_cast(ctx0, cur, GGML_TYPE_F32); - //} cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM, cb, -1); cb(cur, "result_norm", -1); @@ -9312,10 +9332,11 @@ ggml_cgraph * llm_build_context::llama_build_graph( ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tensor * the_attn_norm, ggml_tensor * input, ggml_tensor * inp_pos, ggml_tensor * rope_factors_in, ggml_tensor * KQ_mask, ggml_tensor * sinks, ggml_tensor * inp_attn_scale, float KQ_scale, float f_attn_scale, - int n_swa, int il, bool do_rope, bool add_graph_split) { + int n_swa, int il, bool do_rope, bool add_graph_split, bool add_input, bool is_norm) { if (!model.layers[il].wqkv && !model.layers[il].wqk && cparams.flash_attn && model.layers[il].wq->extra && model.layers[il].wk->extra && model.layers[il].wv->extra && model.layers[il].wo->extra) { if (kv_self.k_l[il]->extra && kv_self.v_l[il]->extra) { + //printf("%s: %s\n", __func__, ggml_op_name(input->op)); ggml_split_tensor_t * attn_norm = the_attn_norm ? 
(ggml_split_tensor_t *)the_attn_norm->extra : nullptr; auto wq = (ggml_split_tensor_t *)model.layers[il].wq->extra; auto wk = (ggml_split_tensor_t *)model.layers[il].wk->extra; @@ -9342,7 +9363,8 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens bv = (ggml_split_tensor_t *)model.layers[il].bv->extra; GGML_ASSERT(bv->n_device == wq->n_device); } - std::vector attn; attn.reserve(wq->n_device); + std::vector attn(wq->n_device, nullptr); + int id_last = -1; for (int id = 0; id < wq->n_device; ++id) { int il_cb = 1000*(id+1) + il; auto split_wq = wq->splits[id]; @@ -9354,13 +9376,22 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens GGML_ASSERT((!split_wq && !split_wk && !split_wv && !split_wo && !split_kl && !split_vl) || (split_wq && split_wk && split_wv && split_wo && split_kl && split_vl)); if (!split_wq) continue; - auto cur = input; + auto cur = get_input_tensor_sm_graph(input, id); if (attn_norm) { - auto split_norm = attn_norm->splits[id]; - cur = llm_build_norm(ctx0, cur, hparams, split_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il_cb); + if (is_norm) { + cur = llm_build_norm(ctx0, cur, lctx.model.hparams, attn_norm->splits[id], NULL, LLM_NORM, cb, il); + GGML_ASSERT(cur->src[0]->op == GGML_OP_NORM); + cur->src[0]->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] = 0xff; + } else { + cur = llm_build_norm(ctx0, cur, lctx.model.hparams, attn_norm->splits[id], NULL, LLM_NORM_RMS, cb, il); + } } - else if (cur->type != GGML_TYPE_F32) { + //if (attn_norm) { + // auto split_norm = attn_norm->splits[id]; + // cur = llm_build_norm(ctx0, cur, hparams, split_norm, NULL, is_norm ? LLM_NORM : LLM_NORM_RMS, cb, il); + // cb(cur, "attn_norm", il_cb); + //} + if (cur->type != GGML_TYPE_F32) { cur = ggml_cast(ctx0, cur, GGML_TYPE_F32); } auto the_q_norm = model.layers[il].attn_q_norm ? model.layers[il].attn_q_norm->extra ? @@ -9486,42 +9517,24 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens cur = ggml_cast(ctx0, cur, GGML_TYPE_F16); } ggml_build_forward_expand(gf, cur); - attn.push_back(cur); + attn[id] = cur; + id_last = id; } - GGML_ASSERT(!attn.empty()); - if (attn.size() == 1) return attn.front(); - //if (attn.size() > 2 && attn.size()%2 == 0) { - // for (int id = 0; id < int(attn.size()/2); ++id) { - // attn[id] = ggml_add(ctx0, attn[2*id+0], attn[2*id+1]); - // attn[id]->op_params[0] = 0xff; - // } - // attn.resize(attn.size()/2); - // auto cur = ggml_add(ctx0, attn[0], attn[1]); - // cur->op_params[0] = 0xff; - // cur->op_params[0] = 0xff; - // for (int id = 2; id < (int)attn.size(); ++id) { - // cur = ggml_add(ctx0, cur, attn[id]); - // cb(cur, "combine_attn", il); - // } - // return cur; - //} - auto cur = ggml_add(ctx0, attn[0], attn[1]); - cb(cur, "combine_attn", il); - cur->op_params[0] = 0xff; - for (int id = 2; id < (int)attn.size(); ++id) { - cur = ggml_add(ctx0, cur, attn[id]); - cb(cur, "combine_attn", il); - } - if (attn.size() > 2) { - cur->op_params[0] = 0xff; + GGML_ASSERT(id_last >= 0); + if (add_input) { + attn[id_last] = ggml_add(ctx0, attn[id_last], input); + cb(attn[id_last], "attn_out_with_input", il); } + auto cur = ggml_reduce(ctx0, attn.data(), wq->n_device, GGML_OP_ADD); + ggml_build_forward_expand(gf, cur); + cb(cur, "attn_combined", il); return cur; } } auto cur = input; if (the_attn_norm) { - cur = llm_build_norm(ctx0, cur, hparams, the_attn_norm, NULL, LLM_NORM_RMS, cb, il); + cur = llm_build_norm(ctx0, cur, hparams, the_attn_norm, NULL, is_norm ? 
LLM_NORM : LLM_NORM_RMS, cb, il); cb(cur, "attn_norm", il); } @@ -9549,5 +9562,10 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, KQ_scale, cb, il, sinks, n_swa); + if (add_input) { + cb(cur, "attn_out", il); + cur = ggml_add(ctx0, cur, input); + } + return cur; } diff --git a/src/llama-build-context.h b/src/llama-build-context.h index 347b177a..498c3a5d 100644 --- a/src/llama-build-context.h +++ b/src/llama-build-context.h @@ -335,7 +335,8 @@ struct llm_build_context { ggml_tensor * act_scales, llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, - const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr); + const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false, + bool is_norm = false, ggml_tensor * add_extra = nullptr); static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx, ggml_tensor * cur, @@ -351,7 +352,7 @@ struct llm_build_context { bool scale_w, float w_scale, llm_expert_gating_func_type gating_op, - const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr); + const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false); static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx, ggml_tensor * cur, @@ -367,7 +368,7 @@ llm_expert_gating_func_type gating_op, bool scale_w, float w_scale, llm_expert_gating_func_type gating_op, - const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr) { + const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false) { return llm_build_moe_ffn(ctx, lctx, cur, gate_inp, nullptr, up_exps, nullptr, @@ -376,7 +377,7 @@ llm_expert_gating_func_type gating_op, exp_probs_b, n_expert, n_expert_used, type_op, norm_w, scale_w, w_scale, - gating_op, cb, il, graph); + gating_op, cb, il, graph, add_input); } static ggml_tensor * llm_build_std_moe_ffn(ggml_context * ctx, llama_context & lctx, @@ -398,7 +399,7 @@ llm_expert_gating_func_type gating_op, float w_scale, llm_expert_gating_func_type gating_op, llm_ffn_op_type type_op_shexp, - const llm_build_cb & cb, int il, ggml_cgraph * graph); + const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input = false); static ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids); @@ -410,6 +411,6 @@ llm_expert_gating_func_type gating_op, ggml_tensor * build_std_attention(ggml_cgraph * gf, ggml_tensor * attn_norm, ggml_tensor * cur, ggml_tensor * inp_pos, ggml_tensor * rope_factors, ggml_tensor * KQ_mask, ggml_tensor * sinks, ggml_tensor * inp_attn_scale, float KQ_scale, float f_attn_scale, - int n_swa, int il, bool do_rope = true, bool add_graph_split = false); + int n_swa, int il, bool do_rope = true, bool add_graph_split = false, bool add_input = false, bool is_norm = false); };
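Note (illustrative only, not part of the patch): the hunks above replace the explicit chain of
ggml_add() calls that combined per-device attention outputs with a single ggml_reduce() node,
collecting the partial results into a fixed-size array with nullptr entries for devices that hold
no split. The sketch below only shows the shape of that call under the signature used above; the
combine_partials() helper, the includes, and the comments are assumptions added for illustration.

    #include "ggml.h"   // ggml.h from this tree, which declares ggml_reduce()
    #include <vector>

    // Reduce per-device partial tensors into one result with a single graph node.
    // Entries may be nullptr for devices that produced no split, mirroring the attn
    // vector in build_std_attention() above; GGML_OP_ADD requests an element-wise sum.
    static ggml_tensor * combine_partials(ggml_context * ctx, std::vector<ggml_tensor *> & parts) {
        return ggml_reduce(ctx, parts.data(), (int)parts.size(), GGML_OP_ADD);
    }

As in build_std_attention() above, the combined tensor would then be passed to
ggml_build_forward_expand() so the reduce op becomes part of the compute graph.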