Graph parallel: the next generation (#1080)

* WIP: absorb adding input into std_attn and std_ffn

* WIP: NCCL infra

* WIP: add reduce and fake_cpy ops

* WIP

* WIP: graph appears to work, layer is broken

* WIP: Qwen3-MoE works with graph, layer still broken

* WIP: GLM-4.5 graph works

* WIP: fix sm layer (dense)

* WIP: fix sm layer (MoE)

* WIP: fast PP with bespoke 4-GPU NCCL

I guess I'm not using NCCL the right way, as PP is very
low with a single communicator group for 3 or more GPUs.
But if I create 4 communicator groups for pairs of GPUs
(0,1, 2,3, 0,2, 1,3) and use that, PP is fast: I'm hitting
1500 t/s for L3-70B on the 4x3090 system, which is
~20% better than the previous sm graph without NCCL.
But that cannot be the solution (I cannot be creating pairwise
communicators and associated logic for every possible number of GPUs).

* WIP: Cohere2

* Explicitly set device

* Bespoke 3-GPU case

* WIP

* Do not repeat get_rows multiple times

* Fix 3 GPUs

* OK, let's leave it in

* Implement the reduce op without NCCL available

* Be able to build without NCCL

cmake -DGGML_NCCL=OFF disables it

* Make --max-gpu work again

* Slightly better for 4 GPUs without NCCL

* Cleanup

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-12-24 08:31:48 +01:00
committed by GitHub
parent 2a633c4357
commit 0d7eb34185
12 changed files with 870 additions and 256 deletions

90
cmake/FindNCCL.cmake Normal file
View File

@@ -0,0 +1,90 @@
# Find the nccl libraries
#
# The following variables are optionally searched for defaults
#  NCCL_ROOT: Base directory where all NCCL components are found
#  NCCL_INCLUDE_DIR: Directory where NCCL header is found
#  NCCL_LIB_DIR: Directory where NCCL library is found
#
# The following are set after configuration is done:
#  NCCL_FOUND
#  NCCL_INCLUDE_DIRS
#  NCCL_LIBRARIES
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NCCL in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601

set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")

# BUGFIX: `if ($ENV{NCCL_ROOT_DIR})` evaluates the *value* of the environment
# variable as a CMake condition; a path such as /usr/local is not a true
# constant, so the deprecation warning never fired. Test for definedness.
if (DEFINED ENV{NCCL_ROOT_DIR})
  message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
endif()
list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})

find_path(NCCL_INCLUDE_DIRS
  NAMES nccl.h
  HINTS ${NCCL_INCLUDE_DIR})

if (USE_STATIC_NCCL)
  message(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
  set(NCCL_LIBNAME "nccl_static")
  if (NCCL_VERSION)  # Prefer the versioned library if a specific NCCL version is specified
    set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
  endif()
else()
  set(NCCL_LIBNAME "nccl")
  if (NCCL_VERSION)  # Prefer the versioned library if a specific NCCL version is specified
    set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
  endif()
endif()

find_library(NCCL_LIBRARIES
  NAMES ${NCCL_LIBNAME}
  HINTS ${NCCL_LIB_DIR})

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)

if (NCCL_FOUND)  # obtaining NCCL version and some sanity checks
  set(NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
  message(STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
  set(OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
  list(APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
  include(CheckCXXSymbolExists)
  check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)

  if (NCCL_VERSION_DEFINED)
    set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
    file(WRITE ${file} "
      #include <iostream>
      #include <nccl.h>
      int main()
      {
        std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
        int x;
        ncclGetVersion(&x);
        return x == NCCL_VERSION_CODE;
      }
")
    try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
      RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
      CMAKE_FLAGS  "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}"
      LINK_LIBRARIES ${NCCL_LIBRARIES})
    if (NOT NCCL_VERSION_MATCHED)
      message(FATAL_ERROR "Found NCCL header version and library version do not match! \
(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
    endif()
    message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}")
  else()
    # NCCL_VERSION_CODE was introduced in NCCL 2.3.5-5; older headers lack it.
    message(STATUS "NCCL version < 2.3.5-5")
  endif()
  set(CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})

  message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
  mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
endif()

View File

@@ -97,6 +97,7 @@ endif()
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_SVE "ggml: enable SVE" OFF)
option(GGML_NCCL "ggml: enable NCCL" ON)
if (WIN32)
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")

View File

@@ -689,6 +689,9 @@ extern "C" {
GGML_OP_GLU,
GGML_OP_REDUCE,
GGML_OP_FAKE_CPY,
GGML_OP_COUNT,
};
@@ -3034,6 +3037,17 @@ extern "C" {
struct ggml_tensor ** splits;
} ggml_split_tensor_t;
GGML_API struct ggml_tensor * ggml_reduce(
struct ggml_context * ctx,
struct ggml_tensor ** a,
int n,
enum ggml_op op);
GGML_API struct ggml_tensor * ggml_fake_cpy(
struct ggml_context * ctx,
struct ggml_tensor * dst,
struct ggml_tensor * src);
#ifdef __cplusplus
}
#endif

View File

@@ -462,6 +462,21 @@ if (GGML_CUDA)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
endif()
endif()
if (GGML_NCCL)
find_package(NCCL)
if (NCCL_FOUND)
message("==================== NCCL found!")
message("NCCL_LIBRARIES = ${NCCL_LIBRARIES}")
message("NCCL_INCLUDE_DIRS = ${NCCL_INCLUDE_DIRS}")
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${NCCL_LIBRARIES})
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${NCCL_INCLUDE_DIRS})
add_compile_definitions(GGML_USE_NCCL)
else()
message("==================== NCCL NOT found -> building wihout NCCL support")
endif()
endif()
if (NOT GGML_MUSA)
set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_INCLUDES 0)
set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_LIBRARIES 0)

View File

@@ -1414,13 +1414,59 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// do not overwrite user assignments
if (*leaf_backend_id == -1) {
*leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
//printf("Pass 1: assigned backend %d to leaf %d, %s\n", *leaf_backend_id, i, graph->leafs[i]->name);
}
}
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
int * node_backend_id = &tensor_backend_id(node);
if (node->op == GGML_OP_REDUCE) {
auto view_src = node->view_src;
int src_id = -1;
for (int j = 0; j < node->op_params[1]; ++j) {
if (node->src[j]) {
int * this_node_backend_id = &tensor_backend_id(node->src[j]);
if (*this_node_backend_id == -1) {
*this_node_backend_id = j;
} else {
GGML_ASSERT(*this_node_backend_id == j);
}
if (view_src == node->src[j]) {
src_id = j;
}
}
}
if (src_id >= 0) {
int * this_node_backend_id = &tensor_backend_id(view_src);
*this_node_backend_id = tensor_backend_id(node->src[src_id]);
*node_backend_id = *this_node_backend_id;
}
}
else if (node->op == GGML_OP_MUL && node->src[0]->op == GGML_OP_NORM) {
// This is a hack for Cohere2. Without this hack the scheduler creates
// totally nonsensical splits for that arch
int * src1_id = &tensor_backend_id(node->src[1]);
if (*src1_id >= 0) {
int * src0_id = &tensor_backend_id(node->src[0]);
int * dst_id = &tensor_backend_id(node);
*src0_id = *src1_id;
*dst_id = *src1_id;
// For some reason that I don't understand, we can have norm backend already assigned
// at this point. How? That's why this more logical approach of first checking is commented out
//if (*src0_id < 0) {
// *src0_id = *src1_id;
//} else {
// printf("Oops: backend_id_src0(%s) = %d, backend_id_src1(%s) = %d\n", node->src[0]->name, *src0_id, node->src[1]->name, *src1_id);
// //GGML_ASSERT(*src0_id == *src1_id);
//}
//if (*dst_id < 0) {
// *dst_id = *src1_id;
//} else {
// printf("Oops: backend_id_dst(%s) = %d, backend_id_src1(%s) = %d\n", node->name, *dst_id, node->src[1]->name, *src1_id);
// //GGML_ASSERT(*dst_id == *src1_id);
//}
}
}
// do not overwrite user assignments
if (*node_backend_id == -1) {
*node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
@@ -1652,6 +1698,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// check if we should start a new split based on the sources of the current node
bool need_new_split = false;
if ((node->op == GGML_OP_ADD && node->op_params[0] == 0xff) ||
node->op == GGML_OP_REDUCE ||
node->op == GGML_OP_FAKE_CPY ||
node->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] == 0xff) {
need_new_split = true;
}
@@ -1739,6 +1787,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
// create a copy of the input in the split's backend
if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
if (node->op == GGML_OP_REDUCE) {
//printf("setting tensor_id_copy(reduce, %zu, %d, %s) to %s\n", src_id, cur_backend_id, node->name, src->name);
tensor_id_copy(src_id, cur_backend_id, 0) = src;
} else if (node->op == GGML_OP_FAKE_CPY && src->op == GGML_OP_REDUCE) {
//printf("setting tensor_id_copy(fake_cpy, %zu, %d, %s) to %s\n", src_id, cur_backend_id, node->name, src->src[j]->name);
tensor_id_copy(src_id, cur_backend_id, 0) = src->src[j];
} else {
ggml_backend_t backend = sched->backends[cur_backend_id];
for (int c = 0; c < sched->n_copies; c++) {
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
@@ -1753,6 +1808,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
int n_inputs = split->n_inputs++;
GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
split->inputs[n_inputs] = src;
}
}
node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
}
@@ -2027,80 +2083,8 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back
}
}
// Execute the scheduler splits in "split-mode graph" fashion: consecutive splits
// that run the same op sequence on *different* backends are grouped, their inputs
// are copied first, and then their graphs are launched back-to-back with
// ggml_backend_graph_compute_async so the backends can run concurrently.
// NOTE(review): this function is currently dead code — the only caller gates it
// with `if (false && sched->split_mode_graph)`.
static ggml_status ggml_backend_sched_compute_splits_sm_graph(ggml_backend_sched_t sched) {
// Scratch buffers reused across ggml_backend_sched_copy_inputs calls.
std::vector<int32_t> ids;
std::vector<uint32_t> unique_ids;
ggml_tensor * last_ids_tensor = nullptr;
// NOTE(review): only element 0 is initialized to true here; the remaining
// entries are zero-initialized (false) — confirm this is the intended start state.
std::array<bool, GGML_SCHED_MAX_BACKENDS> needs_sync{{true}};
auto splits = sched->splits;
std::vector<ggml_backend_sched_split *> this_split;
for (int i = 0; i < sched->n_splits; ++i) {
auto split_i = &splits[i];
this_split.clear();
this_split.push_back(split_i);
// Greedily extend the group with following splits that (a) run on a backend
// not yet in the group (a repeated backend_id ends the group) and (b) execute
// the same sequence of ops, so they are the "same" subgraph on different devices.
for (int j = i+1; j < sched->n_splits; ++j) {
auto split_j = &splits[j];
if (split_i->backend_id == split_j->backend_id) {
break;
}
int n_nodes = std::min(split_i->graph.n_nodes, split_j->graph.n_nodes);
bool same = true;
for (int k = 0; k < n_nodes; ++k) {
if (split_i->graph.nodes[k]->op != split_j->graph.nodes[k]->op) {
same = false; break;
}
}
if (!same) {
break;
}
this_split.push_back(split_j);
}
// Debug dump of the grouped splits; disabled by default.
if (false) {
auto split = this_split.front();
if (this_split.size() == 1) {
printf("=== Split %d with %d inputs on backend %d\n", i, split->n_inputs, split->backend_id);
} else {
printf("=== Split %d with %d inputs on backends", i, split->n_inputs);
for (int j = 0; j < (int)this_split.size(); ++j) printf(" %d", this_split[j]->backend_id);
printf("\n");
}
for (int j = 0; j < split->graph.n_nodes; ++j) {
printf("  %d  %s(%s)\n", j, ggml_op_name(split->graph.nodes[j]->op), split->graph.nodes[j]->name);
}
}
// Stage all input copies for the whole group before any graph is launched.
for (auto split : this_split) {
ggml_backend_sched_copy_inputs(sched, split, needs_sync, ids, unique_ids, last_ids_tensor);
}
// Launch each split's graph asynchronously on its own backend.
for (auto split : this_split) {
auto split_backend_id = split->backend_id;
if (split->n_inputs > 0) {
needs_sync[split_backend_id] = true;
}
auto split_backend = sched->backends[split_backend_id];
auto ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
if (ec != GGML_STATUS_SUCCESS) {
return ec;
}
// Record a completion event so later copies from this backend can synchronize.
if (split->n_inputs > 0) {
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
}
}
}
// Skip past the splits consumed by this group.
i += this_split.size() - 1;
}
return GGML_STATUS_SUCCESS;
}
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
if (false && sched->split_mode_graph) {
return ggml_backend_sched_compute_splits_sm_graph(sched);
}
std::array<bool, GGML_SCHED_MAX_BACKENDS> needs_sync{{true}};
std::array<bool, GGML_SCHED_MAX_BACKENDS> own_cpy{{false}};

View File

@@ -48,6 +48,7 @@
#include "ggml-cuda/argmax.cuh"
#include "ggml-cuda/multiadd.cuh"
#include "ggml-cuda/hadamard.cuh"
#include "ggml-cuda/reduce.cuh"
#include <algorithm>
#include <array>
@@ -143,7 +144,7 @@ int ggml_cuda_get_device() {
return id;
}
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device);
#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
auto res = hipMallocManaged(ptr, size);
@@ -246,6 +247,42 @@ static ggml_cuda_device_info ggml_cuda_init() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
#ifdef GGML_USE_NCCL
info.have_nccl = false;
if (info.device_count > 1) {
int gpu_list[GGML_CUDA_MAX_DEVICES];
for(int i = 0; i < info.device_count; ++i) gpu_list[i] = i;
auto status = ncclCommInitAll(info.nccl_coms, info.device_count, gpu_list);
if (status == ncclSuccess) {
printf("=============================== NCCL main communicator initialized\n");
info.have_nccl = true;
} else {
printf("=============================== NCCL initialization failed with status %d\n", int(status));
GGML_ABORT("Fatal error");
}
auto com = info.nccl_coms + info.device_count;
if (info.device_count == 4) {
int devs[8] = {0,1, 2,3, 0,2, 1,3};
auto com = info.nccl_coms + info.device_count;
for (int ip = 0; ip < 4; ++ip) {
if (auto status = ncclCommInitAll(com+2*ip, 2, devs+2*ip); status != ncclSuccess) {
printf("=============================== NCCL initialization of pair %d failed with status %d\n", ip, int(status));
GGML_ABORT("Fatal error");
}
}
printf("=============================== NCCL pair communicators for %d GPUs initialized\n", info.device_count);
} else if (info.device_count == 3) {
int devs[4] = {0,1, 0,2};
for (int ip = 0; ip < 2; ++ip) {
if (auto status = ncclCommInitAll(com+2*ip, 2, devs+2*ip); status != ncclSuccess) {
printf("=============================== NCCL initialization of pair %d failed with status %d\n", ip, int(status));
GGML_ABORT("Fatal error");
}
}
printf("=============================== NCCL pair communicators for %d GPUs initialized\n", info.device_count);
}
}
#endif
return info;
}
@@ -465,6 +502,11 @@ static std::atomic<int> ggml_cuda_lock_counter;
ggml_backend_cuda_context::ggml_backend_cuda_context(int device) :
device(device), name(GGML_CUDA_NAME + std::to_string(device)) {
auto info = const_cast<ggml_cuda_device_info*>(&ggml_cuda_info());
if (info->all_ctx[device]) {
GGML_CUDA_LOG_WARN("%s: a context for device %d already exists?\n", __func__, device);
}
info->all_ctx[device] = this;
}
ggml_backend_cuda_context::~ggml_backend_cuda_context() {
@@ -472,6 +514,9 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
std::unique_lock<std::mutex> lock(ggml_cuda_lock);
ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; });
auto info = const_cast<ggml_cuda_device_info*>(&ggml_cuda_info());
info->all_ctx[this->device] = nullptr;
if (copy_event != nullptr) {
CUDA_CHECK(cudaEventDestroy(copy_event));
}
@@ -2934,6 +2979,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
//printf("%4d %s(%s) on device %d. time = %ld\n", i, ggml_op_name(dst->op), dst->name, ctx.device, ggml_time_us());
switch (dst->op) {
case GGML_OP_REDUCE:
ggml_cuda_op_reduce(ctx, dst);
break;
case GGML_OP_FAKE_CPY:
break;
case GGML_OP_ARGMAX:
ggml_cuda_argmax(ctx, dst);
break;
@@ -3451,8 +3501,23 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
needs_f16_f32_copy = true;
} else {
#ifdef GGML_USE_NCCL__
auto & info = ggml_cuda_info();
auto nbytes = ggml_nbytes(src);
ncclGroupStart();
ggml_cuda_set_device(cuda_ctx_src->device);
auto status1 = ncclSend(src->data, nbytes, ncclUint8, cuda_ctx_dst->device, info.nccl_coms[cuda_ctx_src->device],
info.all_ctx[cuda_ctx_src->device]->stream());
ggml_cuda_set_device(cuda_ctx_dst->device);
auto status2 = ncclRecv(dst->data, nbytes, ncclUint8, cuda_ctx_src->device, info.nccl_coms[cuda_ctx_dst->device],
info.all_ctx[cuda_ctx_dst->device]->stream());
ncclGroupEnd();
GGML_ASSERT(status1 == ncclSuccess && status2 == ncclSuccess);
return true;
#else
ggml_cuda_set_device(cuda_ctx_src->device);
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
#endif
}
#endif
}
@@ -3733,12 +3798,12 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
}
#endif
#ifndef NDEBUG
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] != nullptr) {
assert(node->src[j]->buffer);
}
}
//assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
//for (int j = 0; j < GGML_MAX_SRC; j++) {
// if (node->src[j] != nullptr) {
// assert(node->src[j]->buffer);
// }
//}
#endif // NDEBUG
bool ok = ggml_cuda_compute_forward(*cuda_ctx, node, cgraph, i);
@@ -4044,6 +4109,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
}
return false;
} break;
case GGML_OP_REDUCE:
case GGML_OP_FAKE_CPY:
case GGML_OP_ARGMAX:
return true;
case GGML_OP_HADAMARD:
@@ -4372,6 +4439,13 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, [[maybe_unused]] con
#endif
}
#ifdef GGML_USE_NCCL
if (!enable_p2p) {
printf("================== P2P disabled, but needed for NCCL\n");
enable_p2p = true;
}
#endif
#if !defined(GGML_CUDA_NO_PEER_COPY)
if (enable_p2p) {
ggml_cuda_set_peer_access(device);

View File

@@ -34,6 +34,10 @@
#include "vendors/cuda.h"
#endif // defined(GGML_USE_HIPBLAS)
#ifdef GGML_USE_NCCL
#include <nccl.h>
#endif
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
@@ -738,6 +742,8 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ5_K_R4> {
//////////////////////
struct ggml_backend_cuda_context;
struct ggml_cuda_device_info {
int device_count;
@@ -754,6 +760,12 @@ struct ggml_cuda_device_info {
cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
ggml_backend_cuda_context * all_ctx[GGML_CUDA_MAX_DEVICES] = { nullptr };
#ifdef GGML_USE_NCCL
ncclComm_t nccl_coms[GGML_CUDA_MAX_DEVICES];
bool have_nccl;
#endif
};
const ggml_cuda_device_info & ggml_cuda_info();
@@ -844,6 +856,9 @@ struct ggml_backend_cuda_context {
bool use_cuda_graph = true;
#endif
void * copy_buffer = nullptr;
size_t copy_size = 0;
explicit ggml_backend_cuda_context(int device);
~ggml_backend_cuda_context();
@@ -889,3 +904,5 @@ struct ggml_backend_cuda_context {
return pool(device);
}
};
cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device);

View File

@@ -0,0 +1,319 @@
//
// Copyright (C) 2023-2024 The ggml authors
// Copyright (C) 2024 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//
#include "reduce.cuh"
#include <chrono>
// Element-wise accumulation kernel: dst[i] += src[i] for every i in [0, nelem).
// One thread per element; block_size must match the launch configuration.
template <typename T, int block_size>
static __global__ void k_add(int nelem, const T * src, T * dst) {
    const int idx = blockIdx.x*block_size + threadIdx.x;
    if (idx < nelem) {
        dst[idx] += src[idx];
    }
}
// Symmetric accumulation kernel: adds src into dst and then mirrors the result
// back into src, so both buffers hold the sum when the kernel completes.
template <typename T, int block_size>
static __global__ void k_add_sym(int nelem, T * src, T * dst) {
    const int idx = blockIdx.x*block_size + threadIdx.x;
    if (idx < nelem) {
        const T sum = dst[idx] + src[idx];
        dst[idx] = sum;
        src[idx] = sum;
    }
}
// Argument block for k_reduce_add: a set of same-sized device buffers to reduce.
struct copy_task {
void * ptrs[GGML_CUDA_MAX_DEVICES]; // ptrs[0] is the accumulator; ptrs[1..nptr-1] are added into it
int nptr;  // number of valid entries in ptrs
int nelem; // number of elements in each buffer
};
// Reduce-and-broadcast kernel: sums task.ptrs[0..nptr-1] element-wise into
// ptrs[0], then writes the final sum back into every other buffer, so all
// participating buffers end up holding the reduced result.
template <typename T, int block_size>
static __global__ void k_reduce_add(copy_task task) {
    const int i = blockIdx.x*block_size + threadIdx.x;
    if (i >= task.nelem) return;
    T * dst = (T *)task.ptrs[0];
    // Accumulate in a register, then publish the sum everywhere.
    T sum = dst[i];
    for (int j = 1; j < task.nptr; ++j) {
        sum += ((const T *)task.ptrs[j])[i];
    }
    dst[i] = sum;
    for (int j = 1; j < task.nptr; ++j) {
        ((T *)task.ptrs[j])[i] = sum;
    }
}
// Compute a GGML_OP_REDUCE node: element-wise sum of up to nreduce per-device
// tensors (dst->src[i], one slot per device, NULL if that device does not
// participate), leaving the result in every participating buffer.
//
// op_params layout (set by ggml_reduce): [0] = reduce op (only GGML_OP_ADD),
// [1] = nreduce (number of source slots), [2] = nhave (non-NULL sources).
//
// Strategy, in order of preference:
//   1. NCCL all-reduce when available and all devices participate
//      (with bespoke pairwise-communicator schedules for 3 and 4 GPUs);
//   2. for 2 GPUs with P2P: each device reduces half of the data in place;
//   3. for 4 GPUs and small batches: tree reduction via k_reduce_add;
//   4. generic fallback: peer-copy everything to ctx.device, add, broadcast back.
void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    auto op = (ggml_op)dst->op_params[0];
    GGML_ASSERT(op == GGML_OP_ADD);
    int nreduce = dst->op_params[1];
    int nhave = dst->op_params[2];
    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(dst));
    GGML_ASSERT(nhave >= 2 && nhave <= nreduce);
    auto & info = ggml_cuda_info();
#ifdef GGML_USE_NCCL
    if (info.have_nccl && nhave == nreduce) { // somehow I'm not able to figure out how to use NCCL when not all GPUs participate in the reduce op
        GGML_ASSERT(info.have_nccl);
        GGML_ASSERT(info.device_count == nreduce);
        auto type = dst->type;
        //int device = ctx.device;
        if (nreduce != info.device_count) {
            GGML_ABORT("Not implemented");
        }
        //auto tim1 = std::chrono::steady_clock::now();
        auto data_type = type == GGML_TYPE_F32 ? ncclFloat : ncclHalf;
        if (nreduce == 4 && dst->ne[1] > 32) {
            // Pairwise schedule over the extra communicators created at init:
            // (0,1) and (2,3) reduce, then (0,2) and (1,3) combine the halves.
            auto com = info.nccl_coms + info.device_count;
            static const int devs[8] = {0,1, 2,3, 0,2, 1,3};
            for (int ip = 0; ip < 4; ++ip) {
                ncclGroupStart();
                ggml_cuda_set_device(devs[2*ip+0]);
                auto status1 = ncclAllReduce(dst->src[devs[2*ip+0]]->data, dst->src[devs[2*ip+0]]->data,
                        ggml_nelements(dst), data_type, ncclSum, com[2*ip+0], info.all_ctx[devs[2*ip+0]]->stream());
                ggml_cuda_set_device(devs[2*ip+1]);
                auto status2 = ncclAllReduce(dst->src[devs[2*ip+1]]->data, dst->src[devs[2*ip+1]]->data,
                        ggml_nelements(dst), data_type, ncclSum, com[2*ip+1], info.all_ctx[devs[2*ip+1]]->stream());
                ncclGroupEnd();
                if (status1 != ncclSuccess || status2 != ncclSuccess) {
                    fprintf(stderr, "%s: ncclAllReduce failed with statuses %d, %d\n", __func__, (int)status1, (int)status2);
                    GGML_ABORT("Fatal error");
                }
            }
        }
        else if (nreduce == 3 && dst->ne[1] > 32) {
            // 3-GPU schedule: (0,1) then (0,2) all-reduce, after which device 0
            // holds the full sum; finally send it from 0 to 1 over the (0,1) pair.
            auto com = info.nccl_coms + info.device_count;
            static const int devs[4] = {0,1, 0,2};
            for (int ip = 0; ip < 2; ++ip) {
                ncclGroupStart();
                ggml_cuda_set_device(devs[2*ip+0]);
                auto status1 = ncclAllReduce(dst->src[devs[2*ip+0]]->data, dst->src[devs[2*ip+0]]->data,
                        ggml_nelements(dst), data_type, ncclSum, com[2*ip+0], info.all_ctx[devs[2*ip+0]]->stream());
                ggml_cuda_set_device(devs[2*ip+1]);
                auto status2 = ncclAllReduce(dst->src[devs[2*ip+1]]->data, dst->src[devs[2*ip+1]]->data,
                        ggml_nelements(dst), data_type, ncclSum, com[2*ip+1], info.all_ctx[devs[2*ip+1]]->stream());
                ncclGroupEnd();
                if (status1 != ncclSuccess || status2 != ncclSuccess) {
                    fprintf(stderr, "%s: ncclAllReduce failed with statuses %d, %d\n", __func__, (int)status1, (int)status2);
                    GGML_ABORT("Fatal error");
                }
            }
            ncclGroupStart();
            ggml_cuda_set_device(0);
            auto status1 = ncclSend(dst->src[0]->data, ggml_nelements(dst), data_type, 1, com[0], info.all_ctx[0]->stream());
            ggml_cuda_set_device(1);
            auto status2 = ncclRecv(dst->src[1]->data, ggml_nelements(dst), data_type, 0, com[1], info.all_ctx[1]->stream());
            ncclGroupEnd();
            if (status1 != ncclSuccess || status2 != ncclSuccess) {
                fprintf(stderr, "%s: ncclSend/Recv failed with statuses %d, %d\n", __func__, (int)status1, (int)status2);
                GGML_ABORT("Fatal error");
            }
        }
        else {
            // Generic NCCL path: one all-reduce over the main communicator group.
            ncclGroupStart();
            for (int i = 0; i < nreduce; ++i) {
                ncclComm_t this_comm;
                if (nhave == nreduce) {
                    this_comm = info.nccl_coms[i];
                } else {
                    // Split off a sub-communicator for the participating devices.
                    // NOTE: currently unreachable — the enclosing if requires nhave == nreduce.
                    auto status = ncclCommSplit(info.nccl_coms[i], dst->src[i] ? 0 : NCCL_SPLIT_NOCOLOR, i, &this_comm, NULL);
                    GGML_ASSERT(status == ncclSuccess);
                }
                ggml_cuda_set_device(i);
                auto stream = info.all_ctx[i]->stream();
                GGML_ASSERT(stream);
                auto status = ncclAllReduce(dst->src[i] ? dst->src[i]->data : nullptr,
                        dst->src[i] ? dst->src[i]->data : nullptr,
                        ggml_nelements(dst), data_type, ncclSum, this_comm, stream);
                if (status != ncclSuccess) {
                    fprintf(stderr, "%s: ncclAllReduce failed with status %d\n", __func__, (int)status);
                    GGML_ABORT("Fatal error");
                }
            }
            ncclGroupEnd();
        }
        ggml_cuda_set_device(ctx.device);
        return;
    }
#endif
    GGML_ASSERT(dst->data == dst->src[ctx.device]->data);
    auto nbytes = ggml_nbytes(dst);
    if (nhave == 2 && (nhave == nreduce || dst->ne[1] <= 8)) {
        int idx[2];
        int ii = 0;
        for (int i = 0; i < nreduce; ++i) {
            if (dst->src[i]) {
                idx[ii++] = i;
            }
        }
        // With P2P access enabled, we can access peer memory as if it was local.
        // Hence, we can launch two reduce kernels, one on each device, each kernel
        // processing half of the data. This very simple approach almost matches NCCL
        // performance (I see ~1% lower PP and TG performance on my 2x3090 system).
        for (int i = 0; i < nhave; ++i) {
            GGML_ASSERT(dst->src[idx[i]]->type == dst->type);
            GGML_ASSERT(ggml_are_same_shape(dst, dst->src[idx[i]]));
            ggml_cuda_set_device(idx[i]);
            if (!info.all_ctx[idx[i]]->copy_event) {
                CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[idx[i]]->copy_event, cudaEventDisableTiming));
            }
            CUDA_CHECK(cudaEventRecord(info.all_ctx[idx[i]]->copy_event, info.all_ctx[idx[i]]->stream()));
        }
        auto nelem = ggml_nelements(dst);
        auto nelem_half = (nelem + 1)/2;
        for (int i = 0; i < nhave; ++i) {
            ggml_cuda_set_device(idx[i]);
            CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[idx[i]]->stream(), info.all_ctx[idx[(i+1)%2]]->copy_event, 0));
            // BUGFIX: the chunk size must depend on the chunk index. The previous
            // expression std::min(nelem_half, nelem - nelem_half) gave the *second*
            // chunk's size for both chunks, so for odd nelem the last element of
            // the first chunk was never reduced.
            auto this_nelem = std::min(nelem_half, nelem - i*nelem_half);
            int nblock = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
            if (dst->type == GGML_TYPE_F16) {
                auto src_ptr = (half *)dst->src[idx[i]]->data + i*nelem_half;
                auto dst_ptr = (half *)dst->src[idx[(i+1)%2]]->data + i*nelem_half;
                k_add_sym<half, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[idx[i]]->stream()>>>(this_nelem, src_ptr, dst_ptr);
            } else {
                auto src_ptr = (float *)dst->src[idx[i]]->data + i*nelem_half;
                auto dst_ptr = (float *)dst->src[idx[(i+1)%2]]->data + i*nelem_half;
                k_add_sym<float, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[idx[i]]->stream()>>>(this_nelem, src_ptr, dst_ptr);
            }
        }
        // Cross-synchronize the two streams so neither device proceeds before
        // the other's half of the reduction is complete.
        for (int i = 0; i < nhave; ++i) {
            ggml_cuda_set_device(idx[i]);
            CUDA_CHECK(cudaEventRecord(info.all_ctx[idx[i]]->copy_event, info.all_ctx[idx[i]]->stream()));
            ggml_cuda_set_device(idx[(i+1)%2]);
            CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[idx[(i+1)%2]]->stream(), info.all_ctx[idx[i]]->copy_event));
        }
        ggml_cuda_set_device(ctx.device);
        return;
    }
    // Collect the indices of the participating devices.
    int idx[GGML_CUDA_MAX_DEVICES];
    {
        int ii = 0;
        bool have_this_device = false;
        for (int i = 0; i < nreduce; ++i) {
            if (dst->src[i]) {
                idx[ii++] = i;
                if (i == ctx.device) have_this_device = true;
            }
        }
        GGML_ASSERT(ii == nhave);
        GGML_ASSERT(have_this_device);
    }
    if (nhave == 4 && dst->ne[1] <= 8) {
        // Small-batch 4-GPU path: two rounds of pairwise k_reduce_add
        // (P2P access lets each kernel read the peer buffer directly).
        for (int ii = 0; ii < nhave; ++ii) {
            int i = idx[ii];
            GGML_ASSERT(dst->src[i]->type == dst->type);
            GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
            ggml_cuda_set_device(i);
            if (!info.all_ctx[i]->copy_event) {
                CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
            }
        }
        auto nelem = ggml_nelements(dst);
        // Round 1: devices idx[0] and idx[2] accumulate their pair partner.
        for (int ii = 0; ii < nhave/2; ++ii) {
            int i = idx[2*ii+0];
            ggml_cuda_set_device(i);
            int nblocks = (nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
            copy_task task;
            task.nptr = nhave/2;
            task.nelem = nelem;
            task.ptrs[0] = (char *)dst->src[i]->data;
            int j = idx[2*ii+1];
            CUDA_CHECK(cudaEventRecord(info.all_ctx[j]->copy_event, info.all_ctx[j]->stream()));
            task.ptrs[1] = (char *)dst->src[j]->data;
            CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
            if (dst->type == GGML_TYPE_F16) {
                k_reduce_add<half, CUDA_REDUCE_BLOCK_SIZE><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
            } else {
                k_reduce_add<float, CUDA_REDUCE_BLOCK_SIZE><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
            }
        }
        for (int ii = 0; ii < nhave/2; ++ii) {
            int i = idx[2*ii+0];
            ggml_cuda_set_device(i);
            CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
        }
        // Round 2: devices idx[1] and idx[3] combine the partial sums crosswise.
        for (int ii = 0; ii < nhave/2; ++ii) {
            int i = idx[2*ii+1];
            ggml_cuda_set_device(i);
            int nblocks = (nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
            copy_task task;
            task.nptr = nhave/2;
            task.nelem = nelem;
            task.ptrs[0] = (char *)dst->src[i]->data;
            int j = idx[(2*ii+2)%nhave];
            task.ptrs[1] = (char *)dst->src[j]->data;
            CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
            if (dst->type == GGML_TYPE_F16) {
                k_reduce_add<half, CUDA_REDUCE_BLOCK_SIZE><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
            } else {
                k_reduce_add<float, CUDA_REDUCE_BLOCK_SIZE><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
            }
        }
        for (int ii = 0; ii < nhave/2; ++ii) {
            int i = idx[2*ii+1];
            ggml_cuda_set_device(i);
            CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
        }
        // Make the round-1 devices wait until the final sums are published.
        for (int ii = 0; ii < nhave/2; ++ii) {
            int i = idx[(2*ii+2)%nhave];
            ggml_cuda_set_device(i);
            int j = idx[2*ii+1];
            CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
        }
        ggml_cuda_set_device(ctx.device);
        return;
    }
    // Generic fallback: stage peer copies into a scratch buffer on ctx.device,
    // add them in, then broadcast the result back to the other devices.
    auto required_size = nbytes*(nhave-1);
    if (required_size > ctx.copy_size) {
        if (ctx.copy_buffer) {
            CUDA_CHECK(cudaFree(ctx.copy_buffer));
        }
        CUDA_CHECK(ggml_cuda_device_malloc(&ctx.copy_buffer, required_size, ctx.device));
        ctx.copy_size = required_size;
    }
    auto ptr = (char *)ctx.copy_buffer;
    for (int ii = 0; ii < nhave; ++ii) {
        int i = idx[ii];
        GGML_ASSERT(dst->src[i]->type == dst->type);
        GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
        if (i == ctx.device) continue;
        ggml_cuda_set_device(i);
        CUDA_CHECK(cudaMemcpyPeerAsync(ptr, ctx.device, dst->src[i]->data, i, nbytes, info.all_ctx[i]->stream()));
        if (!info.all_ctx[i]->copy_event) {
            CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
        }
        CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
        ptr += nbytes;
    }
    auto nelem = ggml_nelements(dst);
    int num_blocks = (nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
    ggml_cuda_set_device(ctx.device);
    ptr = (char *)ctx.copy_buffer;
    for (int ii = 0; ii < nhave; ++ii) {
        int i = idx[ii];
        if (i == ctx.device) continue;
        CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0));
        if (dst->type == GGML_TYPE_F16) {
            k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, ctx.stream()>>>(nelem, (const half *)ptr, (half *)dst->data);
        } else {
            k_add<float, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, ctx.stream()>>>(nelem, (const float *)ptr, (float *)dst->data);
        }
        ptr += nbytes;
    }
    if (!ctx.copy_event) {
        CUDA_CHECK(cudaEventCreateWithFlags(&ctx.copy_event, cudaEventDisableTiming));
    }
    CUDA_CHECK(cudaEventRecord(ctx.copy_event, ctx.stream()));
    for (int ii = 0; ii < nhave; ++ii) {
        int i = idx[ii];
        if (i == ctx.device) continue;
        ggml_cuda_set_device(i);
        CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), ctx.copy_event, 0));
        CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->data, ctx.device, nbytes, info.all_ctx[i]->stream()));
    }
    ggml_cuda_set_device(ctx.device);
}

View File

@@ -0,0 +1,7 @@
#include "common.cuh"
// Thread-block size shared by the reduction kernels in reduce.cu.
#define CUDA_REDUCE_BLOCK_SIZE 256
// Execute a GGML_OP_REDUCE node (element-wise sum of per-device tensors).
void ggml_cuda_op_reduce(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
// NOTE(review): declared but GGML_OP_FAKE_CPY is handled as a no-op in the
// ggml-cuda.cu dispatch switch — confirm whether this declaration is still needed.
void ggml_cuda_op_fake_cpy(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -4291,9 +4291,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS_BACK",
"GLU",
"REDUCE",
"FAKE_CPY",
};
static_assert(GGML_OP_COUNT == 92, "GGML_OP_COUNT != 92");
static_assert(GGML_OP_COUNT == 94, "GGML_OP_COUNT != 94");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -4398,10 +4401,13 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss(x,y)",
"cross_entropy_loss_back(x,y)",
"glu(x),"
"glu(x),",
"reduce(x1,x2,...)",
"fake_cpy(x,y)",
};
static_assert(GGML_OP_COUNT == 92, "GGML_OP_COUNT != 92");
static_assert(GGML_OP_COUNT == 94, "GGML_OP_COUNT != 94");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -6060,6 +6066,43 @@ struct ggml_tensor * ggml_dup_inplace(
return ggml_dup_impl(ctx, a, true);
}
struct ggml_tensor * ggml_reduce(
        struct ggml_context * ctx,
        struct ggml_tensor ** a,
        int n,
        enum ggml_op op) {
    GGML_ASSERT(n > 1 && n <= GGML_MAX_SRC);
    GGML_ASSERT(op == GGML_OP_ADD); // currently we only handle reduce_add

    // Count the participating (non-NULL) sources and remember the last one;
    // the result is created as a view of that last source.
    int n_src = 0;
    struct ggml_tensor * anchor = NULL;
    for (int i = 0; i < n; ++i) {
        if (a[i] != NULL) {
            anchor = a[i];
            ++n_src;
        }
    }
    GGML_ASSERT(anchor != NULL);
    GGML_ASSERT(n_src > 1);

    struct ggml_tensor * result = ggml_view_tensor(ctx, anchor);
    for (int i = 0; i < n; ++i) {
        result->src[i] = a[i]; // NULL slots are recorded as-is
    }

    result->op           = GGML_OP_REDUCE;
    result->op_params[0] = (int)op;  // reduction kind (only ADD for now)
    result->op_params[1] = n;        // total number of source slots
    result->op_params[2] = n_src;    // number of non-NULL sources

    return result;
}
struct ggml_tensor * ggml_fake_cpy(
        struct ggml_context * ctx,
        struct ggml_tensor * dst,
        struct ggml_tensor * src) {
    // Build the op node as a view of dst; src is only recorded as the second
    // source operand — no data movement happens at graph-build time.
    struct ggml_tensor * view = ggml_view_tensor(ctx, dst);
    view->src[0] = dst;
    view->src[1] = src;
    view->op     = GGML_OP_FAKE_CPY;
    return view;
}
// ggml_add
static struct ggml_tensor * ggml_add_impl(
@@ -8433,6 +8476,21 @@ struct ggml_tensor * ggml_get_rows(
if (a->type == GGML_TYPE_I32) {
type = a->type;
}
//if (a->op == GGML_OP_REDUCE) {
// //printf("======================= %s(%s)\n", __func__, a->name);
// struct ggml_tensor * result = NULL;
// for (int j = a->op_params[1]-1; j >= 0; --j) {
// if (a->src[j]) {
// struct ggml_tensor * aj = ggml_get_rows(ctx, a->src[j], b);
// if (result == NULL) result = ggml_view_tensor(ctx, aj);
// result->src[j] = aj;
// }
// }
// GGML_ASSERT(result);
// return result;
//}
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
result->op = GGML_OP_GET_ROWS;
@@ -22675,6 +22733,14 @@ static int ggml_compute_forward(struct ggml_compute_params * params, struct ggml
#endif
switch (tensor->op) {
case GGML_OP_REDUCE:
{
GGML_ABORT("REDUCE not implemented");
}
case GGML_OP_FAKE_CPY:
{
GGML_ABORT("FAKE_CPY not implemented");
}
case GGML_OP_DUP:
{
ggml_compute_forward_dup(params, tensor);
@@ -23352,6 +23418,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
struct ggml_tensor * src2 = tensor->src[2];
switch (tensor->op) {
case GGML_OP_REDUCE:
{
GGML_ABORT("REDUCE not implemented");
}
case GGML_OP_FAKE_CPY:
{
GGML_ABORT("FAKE_CPY not implemented");
}
case GGML_OP_DUP:
{
if (src0->grad) {

View File

@@ -620,6 +620,20 @@ ggml_tensor * llm_build_context::llm_build_norm(
return cur;
}
// For a REDUCE input, select the source slot belonging to device `id` so the
// split graph keeps operating on that device's own partial result; otherwise
// (non-REDUCE input, missing slot, or the slot the reduce aliases) return the
// input tensor unchanged.
static ggml_tensor * get_input_tensor_sm_graph(ggml_tensor * input, int id) {
    if (input->op != GGML_OP_REDUCE) {
        return input;
    }
    auto view_src = input->view_src;
    GGML_ASSERT(view_src);
    auto split = input->src[id];
    if (!split || split == view_src) {
        //printf("%s: Setting input to %s for id = %d\n", __func__, view_src->name, id);
        return input;
    }
    return split;
}
ggml_tensor * llm_build_context::llm_build_ffn(
ggml_context * ctx,
llama_context & lctx,
@@ -637,19 +651,21 @@ ggml_tensor * llm_build_context::llm_build_ffn(
ggml_tensor * act_scales,
llm_ffn_op_type type_op,
llm_ffn_gate_type type_gate,
const llm_build_cb & cb, int il, ggml_cgraph * graph) {
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input,
bool is_norm, ggml_tensor * add_extra) {
if (!up_b && !up_s && !gate_b && !gate_s && !down_b && !down_s &&
up->extra && gate->extra && down->extra && type_gate == LLM_FFN_PAR &&
(type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) {
//printf("%s: %s\n", __func__, ggml_op_name(input->op));
auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU;
auto u = (ggml_split_tensor_t *)up->extra;
auto g = (ggml_split_tensor_t *)gate->extra;
auto d = (ggml_split_tensor_t *)down->extra;
GGML_ASSERT(u->n_device == g->n_device && u->n_device == d->n_device);
std::vector<ggml_tensor *> ffn;
ffn.reserve(u->n_device);
std::vector<ggml_tensor *> ffn(u->n_device, nullptr);
int id_last = -1;
for (int id = 0; id < u->n_device; ++id) {
int il_cb = 1000*(id+1) + il;
auto split_u = u->splits[id];
@@ -657,15 +673,21 @@ ggml_tensor * llm_build_context::llm_build_ffn(
auto split_d = d->splits[id];
GGML_ASSERT((!split_u && !split_g && !split_d) || (split_u && split_g && split_d));
if (!split_u) continue;
auto cur = input;
auto cur = get_input_tensor_sm_graph(input, id);
if (ffn_norm && ffn_norm->extra) {
auto norm = (ggml_split_tensor_t *)ffn_norm->extra;
GGML_ASSERT(norm->splits[id]);
cur = llm_build_norm(ctx, input, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il);
if (is_norm) {
cur = llm_build_norm(ctx, cur, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM, cb, il);
GGML_ASSERT(cur->src[0]->op == GGML_OP_NORM);
cur->src[0]->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] = 0xff;
} else {
cur = llm_build_norm(ctx, cur, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il);
}
cb(cur, "ffn_inp_normed", il_cb);
}
else if (input->type != GGML_TYPE_F32) {
cur = ggml_cast(ctx, input, GGML_TYPE_F32);
else if (cur->type != GGML_TYPE_F32) {
cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
}
cur = ggml_fused_up_gate(ctx, split_u, split_g, cur, unary_op);
cb(cur, "ffn_up_gate", il_cb);
@@ -681,32 +703,31 @@ ggml_tensor * llm_build_context::llm_build_ffn(
if (graph) {
ggml_build_forward_expand(graph, cur);
}
ffn.push_back(cur);
ffn[id] = cur;
id_last = id;
}
if (ffn.size() == 1) return ffn.front();
auto cur = ggml_add(ctx, ffn[0], ffn[1]);
cb(cur, "combine_ffn", il);
cur->op_params[0] = 0xff;
for (int id = 2; id < int(ffn.size()); ++id) {
cur = ggml_add(ctx, cur, ffn[id]);
cb(cur, "combine_ffn", il);
GGML_ASSERT(id_last >= 0);
if (add_input) {
ffn[id_last] = ggml_add(ctx, ffn[id_last], input);
cb(ffn[id_last], "ffn_with_inp", il);
}
if (ffn.size() > 2) {
cur->op_params[0] = 0xff;
if (add_extra) {
ffn[id_last] = ggml_add(ctx, ffn[id_last], add_extra);
cb(ffn[id_last], "ffn_with_inp", il);
}
//if (cur->type != GGML_TYPE_F32) {
// cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
//}
auto cur = ggml_reduce(ctx, ffn.data(), u->n_device, GGML_OP_ADD);
cb(cur, "ffn_combined", il);
ggml_build_forward_expand(graph, cur);
return cur;
}
auto cur = input;
if (ffn_norm) {
input = llm_build_norm(ctx, input, lctx.model.hparams, ffn_norm, NULL, LLM_NORM_RMS, cb, il);
cur = llm_build_norm(ctx, cur, lctx.model.hparams, ffn_norm, NULL, is_norm ? LLM_NORM : LLM_NORM_RMS, cb, il);
cb(input, "ffn_norm", il);
}
else if (input->type != GGML_TYPE_F32) {
input = ggml_cast(ctx, input, GGML_TYPE_F32);
if (cur->type != GGML_TYPE_F32) {
cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
}
if (lctx.cparams.fused_up_gate &&
@@ -714,7 +735,7 @@ ggml_tensor * llm_build_context::llm_build_ffn(
(type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) {
auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU;
auto cur = ggml_fused_up_gate(ctx, up, gate, input, unary_op);
cur = ggml_fused_up_gate(ctx, up, gate, cur, unary_op);
cb(cur, "ffn_up_gate", il);
if (down) {
cur = llm_build_lora_mm(lctx, ctx, down, cur);
@@ -733,10 +754,18 @@ ggml_tensor * llm_build_context::llm_build_ffn(
cur = ggml_mul(ctx, cur, down_s);
cb(cur, "ffn_down_s", il);
}
if (add_input) {
cur = ggml_add(ctx, cur, input);
cb(cur, "ffn_out_with_inp", il);
}
if (add_extra) {
cur = ggml_add(ctx, cur, add_extra);
cb(cur, "ffn_out_with_inp", il);
}
return cur;
}
struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, input) : input;
struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur;
cb(tmp, "ffn_up", il);
if (up_b) {
@@ -749,7 +778,6 @@ ggml_tensor * llm_build_context::llm_build_ffn(
cb(tmp, "ffn_up_s", il);
}
auto cur = input;
if (gate) {
switch (type_gate) {
case LLM_FFN_SEQ:
@@ -849,6 +877,15 @@ ggml_tensor * llm_build_context::llm_build_ffn(
cb(cur, "ffn_down_s", il);
}
if (add_input) {
cur = ggml_add(ctx, cur, input);
cb(cur, "ffn_out_with_inp", il);
}
if (add_extra) {
cur = ggml_add(ctx, cur, add_extra);
cb(cur, "ffn_out_with_inp", il);
}
return cur;
}
@@ -868,7 +905,9 @@ ggml_tensor * llm_build_context::llm_build_moe_ffn(
bool scale_w,
float w_scale,
llm_expert_gating_func_type gating_op,
const llm_build_cb & cb, int il, ggml_cgraph * graph) {
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input) {
auto input = cur;
int64_t n_embd = cur->ne[0];
int64_t n_tokens = cur->ne[1];
@@ -1040,20 +1079,30 @@ llm_expert_gating_func_type gating_op,
if (lctx.cparams.fused_mmad) {
experts = ggml_mul_multi_add(ctx, experts, weights);
cb(experts, "ffn_moe_weighted", il);
if (add_input) {
experts = ggml_add(ctx, experts, input);
cb(experts, "ffn_out_with_inp", il);
}
return experts;
}
experts = ggml_mul(ctx, experts, weights);
cb(experts, "ffn_moe_weighted", il);
}
ggml_tensor * result;
if (n_expert_used == 1) {
return ggml_cont(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0));
result = ggml_cont(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0));
}
if (n_expert_used == 2) {
return ggml_add(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0),
result = ggml_add(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0),
ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], experts->nb[1]));
}
return ggml_multi_add(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0), n_expert_used);
result = ggml_multi_add(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0), n_expert_used);
if (add_input) {
cb(result, "ffn_out", il);
result = ggml_add(ctx, result, input);
}
return result;
}
@@ -1076,7 +1125,7 @@ ggml_tensor * llm_build_context::llm_build_std_moe_ffn(ggml_context * ctx, llama
float w_scale,
llm_expert_gating_func_type gating_op,
llm_ffn_op_type type_op_shexp,
const llm_build_cb & cb, int il, ggml_cgraph * graph) {
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input) {
auto split_up_exps = (ggml_split_tensor_t *)up_exps->extra;
auto split_gate_exps = (ggml_split_tensor_t *)gate_exps->extra;
@@ -1092,10 +1141,10 @@ llm_expert_gating_func_type gating_op,
if (ffn_norm) {
auto the_ffn_norm = ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[lctx.model.main_gpu] : ffn_norm;
GGML_ASSERT(the_ffn_norm);
cur = llm_build_norm(ctx, input, lctx.model.hparams, the_ffn_norm, nullptr, LLM_NORM_RMS, cb, il);
cur = llm_build_norm(ctx, cur, lctx.model.hparams, the_ffn_norm, nullptr, LLM_NORM_RMS, cb, il);
cb(cur, "ffn_inp_normed", il);
}
else if (cur->type != GGML_TYPE_F32) {
if (cur->type != GGML_TYPE_F32) {
cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
}
auto the_gate_inp = gate_inp->extra ? ((ggml_split_tensor_t *)gate_inp->extra)->splits[lctx.model.main_gpu] : gate_inp;
@@ -1110,8 +1159,12 @@ llm_expert_gating_func_type gating_op,
the_exp_probs_b,
n_expert, n_expert_used,
type_op, norm_w, scale_w, w_scale,
gating_op, cb, il, graph);
gating_op, cb, il, graph, false);
cb(routed_out, "routed_out", il);
if (add_input) {
routed_out = ggml_add(ctx, routed_out, input);
cb(routed_out, "routed_out_with_inp", il);
}
ggml_build_forward_expand(graph, routed_out);
if (up_shexp && gate_shexp && down_shexp) {
@@ -1176,26 +1229,27 @@ llm_expert_gating_func_type gating_op,
}
GGML_ASSERT(split_up_exps && split_gate_exps && split_down_exps);
GGML_ASSERT(split_up_exps->n_device == split_gate_exps->n_device && split_up_exps->n_device == split_down_exps->n_device);
std::vector<ggml_tensor *> results; results.reserve(split_up_exps->n_device);
std::vector<ggml_tensor *> results(split_up_exps->n_device, nullptr);
GGML_ASSERT((!split_up_shexp && !split_gate_shexp && !split_down_shexp) ||
( split_up_shexp && split_gate_shexp && split_down_shexp));
auto split_gate_inp = (ggml_split_tensor_t *)gate_inp->extra;
GGML_ASSERT(split_gate_inp && split_gate_inp->n_device == split_up_exps->n_device);
auto split_exp_probs_b = exp_probs_b ? (ggml_split_tensor_t *)exp_probs_b->extra : nullptr;
GGML_ASSERT(!split_exp_probs_b || split_exp_probs_b->n_device == split_up_exps->n_device);
int last_id = -1;
for (int id = 0; id < split_up_exps->n_device; ++id) {
GGML_ASSERT((split_up_exps->splits[id] && split_gate_exps->splits[id] && split_down_exps->splits[id]) ||
(!split_up_exps->splits[id] && !split_gate_exps->splits[id] && !split_down_exps->splits[id]));
if (!split_up_exps->splits[id]) continue;
int il_cb = 1000*(id + 1) + il;
auto cur = input;
auto cur = get_input_tensor_sm_graph(input, id);
if (ffn_norm) {
auto split_ffn_norm = (ggml_split_tensor_t *)ffn_norm->extra;
GGML_ASSERT(split_ffn_norm && split_ffn_norm->n_device == split_up_exps->n_device);
cur = llm_build_norm(ctx, input, lctx.model.hparams, split_ffn_norm->splits[id], nullptr, LLM_NORM_RMS, cb, il);
cur = llm_build_norm(ctx, cur, lctx.model.hparams, split_ffn_norm->splits[id], nullptr, LLM_NORM_RMS, cb, il);
cb(cur, "ffn_inp_normed", il_cb);
}
else if (cur->type != GGML_TYPE_F32) {
if (cur->type != GGML_TYPE_F32) {
cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
}
auto routed_out = llm_build_moe_ffn(ctx, lctx, cur,
@@ -1206,7 +1260,7 @@ llm_expert_gating_func_type gating_op,
split_exp_probs_b ? split_exp_probs_b->splits[id] : nullptr,
n_expert, n_expert_used,
type_op, norm_w, scale_w, w_scale,
gating_op, cb, il, graph);
gating_op, cb, il, graph, false);
cb(routed_out, "routed_out", il_cb);
if (split_up_shexp) {
@@ -1229,19 +1283,20 @@ llm_expert_gating_func_type gating_op,
cur = ggml_cast(ctx, cur, GGML_TYPE_F16);
cb(cur, "ffn_out_f16", il_cb);
}
ggml_build_forward_expand(graph, routed_out);
results.push_back(cur);
ggml_build_forward_expand(graph, cur);
results[id] = cur;
last_id = id;
}
GGML_ASSERT(last_id >= 0);
if (add_input) {
results[last_id] = ggml_add(ctx, results[last_id], input);
cb(results[last_id], "ffn_inp_added", il);
}
GGML_ASSERT(!results.empty());
if (results.size() == 1) return results.front();
auto cur = ggml_add(ctx, results[0], results[1]);
cur->op_params[0] = 0xff;
cb(cur, "ffn_combined", il);
for (int id = 2; id < int(results.size()); ++id) {
cur = ggml_add(ctx, cur, results[id]);
cb(cur, "ffn_combined", il);
}
auto cur = ggml_reduce(ctx, results.data(), split_up_exps->n_device, GGML_OP_ADD);
cb(cur, "moe_ffn_combined", il);
ggml_build_forward_expand(graph, cur);
return cur;
}
@@ -1754,7 +1809,7 @@ ggml_cgraph * llm_build_context::build_llama() {
// self-attention
if (use_rope) {
cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr,
this_KQ_mask, nullptr, nullptr, kq_scale, hparams.f_attention_scale, this_n_swa, il);
this_KQ_mask, nullptr, nullptr, kq_scale, hparams.f_attention_scale, this_n_swa, il, true, false, true);
}
else {
@@ -1801,15 +1856,18 @@ ggml_cgraph * llm_build_context::build_llama() {
Kcur, Vcur, Qcur, this_KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il, nullptr,
this_n_swa);
}
//printf("%s: attn result for layer %d is %s, %s\n", __func__, il, cur->name, ggml_op_name(cur->op));
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
n_tokens = n_outputs;
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
cb(cur, "last_attn", il);
cb(inpSA, "last_ffn_inp", il);
if (!use_rope) {
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
cb(inpSA, "last_ffn_inp", il);
}
}
// For Granite architecture
@@ -1818,8 +1876,13 @@ ggml_cgraph * llm_build_context::build_llama() {
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
ggml_tensor * ffn_inp;
if (use_rope) {
ffn_inp = cur;
} else {
ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
}
// feed-forward network
if (model.layers[il].ffn_gate_inp == nullptr) {
@@ -1829,7 +1892,7 @@ ggml_cgraph * llm_build_context::build_llama() {
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf);
LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf, true);
cb(cur, "ffn_out", il);
} else if (model.arch == LLM_ARCH_LLAMA4) {
// llama4 MoE
@@ -1846,7 +1909,7 @@ ggml_cgraph * llm_build_context::build_llama() {
LLM_FFN_SILU, false,
false, 0.0,
LLM_EXPERT_GATING_FUNC_SIGMOID,
cb, il, gf);
cb, il, gf, true);
// Shared experts
ggml_tensor * shexp_out = llm_build_ffn(ctx0, lctx, nullptr, ffn_inp_normed,
@@ -1875,9 +1938,10 @@ ggml_cgraph * llm_build_context::build_llama() {
LLM_FFN_SILU, true,
false, 0.0,
LLM_EXPERT_GATING_FUNC_SOFTMAX,
cb, il, gf);
cb, il, gf, true);
cb(cur, "ffn_moe_out", il);
}
//printf("%s: ffn result for layer %d is %s, %s\n", __func__, il, cur->name, ggml_op_name(cur->op));
// For Granite architecture
if (hparams.f_residual_scale) {
@@ -1885,8 +1949,8 @@ ggml_cgraph * llm_build_context::build_llama() {
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
}
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
//cur = ggml_add(ctx0, cur, ffn_inp);
//cb(cur, "ffn_out", il);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
@@ -3933,23 +3997,26 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
//struct ggml_tensor * inpSA = inpL;
// norm
//cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
//cb(cur, "attn_norm", il);
cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, KQ_mask, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il);
cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, KQ_mask, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), 0.0f, 0,
il, true, false, true);
//printf("%s: attn = %s(%s)\n", __func__, cur->name, ggml_op_name(cur->op));
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
//inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
auto ffn_inp = cur;
//struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
//cb(ffn_inp, "ffn_inp", il);
cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp,
model.layers[il].ffn_gate_inp, nullptr,
@@ -3963,9 +4030,11 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {
n_expert, n_expert_used,
LLM_FFN_SILU, true, false, 0.0f,
LLM_EXPERT_GATING_FUNC_SOFTMAX,
LLM_FFN_SILU, cb, il, gf);
LLM_FFN_SILU, cb, il, gf, true);
cur = ggml_add(ctx0, cur, ffn_inp);
//printf("%s: ffn = %s(%s)\n", __func__, cur->name, ggml_op_name(cur->op));
//cur = ggml_add(ctx0, cur, ffn_inp);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
@@ -6818,7 +6887,7 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {
// self-attention
if (rope_cache == nullptr) {
cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, KQ_mask, nullptr, nullptr, kq_scale, 0.0f, 0, il);
cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, KQ_mask, nullptr, nullptr, kq_scale, 0.0f, 0, il, true, false, true);
} else {
// Pre-attention norm
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
@@ -6862,8 +6931,13 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {
}
// residual connection for attention output
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
ggml_tensor * ffn_inp;
if (rope_cache) {
ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
} else {
ffn_inp = cur;
}
if ((uint32_t) il < hparams.n_layer_dense_lead) {
// dense FFN
@@ -6872,7 +6946,7 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf);
LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf, true);
cb(cur, "ffn_out", il);
} else {
cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp,
@@ -6887,39 +6961,11 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm, true, hparams.expert_weights_scale,
(llm_expert_gating_func_type) hparams.expert_gating_func,
LLM_FFN_SILU, cb, il, gf);
//// Post-attention norm
//cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il);
//cb(cur, "post_attn_norm", il);
//// MoE FFN
//auto routed_out = llm_build_moe_ffn(ctx0, lctx, cur,
// model.layers[il].ffn_gate_inp,
// model.layers[il].ffn_up_exps,
// model.layers[il].ffn_gate_exps,
// model.layers[il].ffn_down_exps,
// model.layers[il].ffn_exp_probs_b,
// n_expert, n_expert_used,
// LLM_FFN_SILU, hparams.expert_weights_norm,
// true, hparams.expert_weights_scale,
// (enum llm_expert_gating_func_type) hparams.expert_gating_func,
// cb, il, gf);
//cb(routed_out, "routed_out", il);
//auto shared_out = llm_build_ffn(ctx0, lctx, nullptr, cur,
// model.layers[il].ffn_up_shexp, NULL, NULL,
// model.layers[il].ffn_gate_shexp, NULL, NULL,
// model.layers[il].ffn_down_shexp, NULL, NULL,
// NULL,
// LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
//cb(shared_out, "ffn_shexp_out", il);
//cur = ggml_add(ctx0, routed_out, shared_out);
//cb(cur, "ffn_out", il);
LLM_FFN_SILU, cb, il, gf, true);
}
// residual and context vector
cur = ggml_add(ctx0, cur, ffn_inp);
//cur = ggml_add(ctx0, cur, ffn_inp);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
@@ -7229,48 +7275,25 @@ ggml_cgraph * llm_build_context::build_cohere2() {
const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
// norm
auto attn_norm = model.layers[il].attn_norm;
int id = -1;
if (attn_norm->extra) {
auto extra = (ggml_split_tensor_t *)attn_norm->extra;
for (int i = extra->n_device-1; i >= 0; --i) {
if (extra->splits[i]) {
attn_norm = extra->splits[i];
id = i;
break;
}
}
}
cur = llm_build_norm(ctx0, inpL, hparams, attn_norm, NULL, LLM_NORM, cb, il);
if (id >= 0) {
ggml_backend_sched_set_tensor_backend(lctx.sched, cur->src[0], ggml_backend_sched_get_backend(lctx.sched, id));
}
cb(cur, "attn_norm", il);
auto ffn_inp = cur;
// self-attention
auto attn_out = build_std_attention(gf, nullptr, cur, inp_pos, nullptr, KQ_mask_l, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), 0.f,
is_sliding ? hparams.n_swa : 0, il, is_sliding, true);
auto attn_out = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, KQ_mask_l, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), 0.f,
is_sliding ? hparams.n_swa : 0, il, is_sliding, false, true, true);
cb(attn_out, "attn_out", il);
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
attn_out = ggml_get_rows(ctx0, attn_out, inp_out_ids);
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
// feed-forward network
cur = llm_build_ffn(ctx0, lctx, nullptr, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
cur = llm_build_ffn(ctx0, lctx, model.layers[il].attn_norm, inpL, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR,
cb, il, gf);
cb, il, gf, false, true, attn_out);
cb(cur, "ffn_out", il);
// add together residual + FFN + self-attention
cur = ggml_add(ctx0, cur, attn_out);
cur = ggml_add(ctx0, cur, inpL);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
@@ -7279,9 +7302,6 @@ ggml_cgraph * llm_build_context::build_cohere2() {
}
cur = inpL;
//if (cur->type != GGML_TYPE_F32) {
// cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
//}
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM, cb, -1);
cb(cur, "result_norm", -1);
@@ -9312,10 +9332,11 @@ ggml_cgraph * llm_build_context::llama_build_graph(
ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tensor * the_attn_norm,
ggml_tensor * input, ggml_tensor * inp_pos, ggml_tensor * rope_factors_in,
ggml_tensor * KQ_mask, ggml_tensor * sinks, ggml_tensor * inp_attn_scale, float KQ_scale, float f_attn_scale,
int n_swa, int il, bool do_rope, bool add_graph_split) {
int n_swa, int il, bool do_rope, bool add_graph_split, bool add_input, bool is_norm) {
if (!model.layers[il].wqkv && !model.layers[il].wqk && cparams.flash_attn &&
model.layers[il].wq->extra && model.layers[il].wk->extra && model.layers[il].wv->extra && model.layers[il].wo->extra) {
if (kv_self.k_l[il]->extra && kv_self.v_l[il]->extra) {
//printf("%s: %s\n", __func__, ggml_op_name(input->op));
ggml_split_tensor_t * attn_norm = the_attn_norm ? (ggml_split_tensor_t *)the_attn_norm->extra : nullptr;
auto wq = (ggml_split_tensor_t *)model.layers[il].wq->extra;
auto wk = (ggml_split_tensor_t *)model.layers[il].wk->extra;
@@ -9342,7 +9363,8 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
bv = (ggml_split_tensor_t *)model.layers[il].bv->extra;
GGML_ASSERT(bv->n_device == wq->n_device);
}
std::vector<ggml_tensor*> attn; attn.reserve(wq->n_device);
std::vector<ggml_tensor*> attn(wq->n_device, nullptr);
int id_last = -1;
for (int id = 0; id < wq->n_device; ++id) {
int il_cb = 1000*(id+1) + il;
auto split_wq = wq->splits[id];
@@ -9354,13 +9376,22 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
GGML_ASSERT((!split_wq && !split_wk && !split_wv && !split_wo && !split_kl && !split_vl) ||
(split_wq && split_wk && split_wv && split_wo && split_kl && split_vl));
if (!split_wq) continue;
auto cur = input;
auto cur = get_input_tensor_sm_graph(input, id);
if (attn_norm) {
auto split_norm = attn_norm->splits[id];
cur = llm_build_norm(ctx0, cur, hparams, split_norm, NULL, LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il_cb);
if (is_norm) {
cur = llm_build_norm(ctx0, cur, lctx.model.hparams, attn_norm->splits[id], NULL, LLM_NORM, cb, il);
GGML_ASSERT(cur->src[0]->op == GGML_OP_NORM);
cur->src[0]->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] = 0xff;
} else {
cur = llm_build_norm(ctx0, cur, lctx.model.hparams, attn_norm->splits[id], NULL, LLM_NORM_RMS, cb, il);
}
}
else if (cur->type != GGML_TYPE_F32) {
//if (attn_norm) {
// auto split_norm = attn_norm->splits[id];
// cur = llm_build_norm(ctx0, cur, hparams, split_norm, NULL, is_norm ? LLM_NORM : LLM_NORM_RMS, cb, il);
// cb(cur, "attn_norm", il_cb);
//}
if (cur->type != GGML_TYPE_F32) {
cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
}
auto the_q_norm = model.layers[il].attn_q_norm ? model.layers[il].attn_q_norm->extra ?
@@ -9486,42 +9517,24 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
cur = ggml_cast(ctx0, cur, GGML_TYPE_F16);
}
ggml_build_forward_expand(gf, cur);
attn.push_back(cur);
attn[id] = cur;
id_last = id;
}
GGML_ASSERT(!attn.empty());
if (attn.size() == 1) return attn.front();
//if (attn.size() > 2 && attn.size()%2 == 0) {
// for (int id = 0; id < int(attn.size()/2); ++id) {
// attn[id] = ggml_add(ctx0, attn[2*id+0], attn[2*id+1]);
// attn[id]->op_params[0] = 0xff;
// }
// attn.resize(attn.size()/2);
// auto cur = ggml_add(ctx0, attn[0], attn[1]);
// cur->op_params[0] = 0xff;
// cur->op_params[0] = 0xff;
// for (int id = 2; id < (int)attn.size(); ++id) {
// cur = ggml_add(ctx0, cur, attn[id]);
// cb(cur, "combine_attn", il);
// }
// return cur;
//}
auto cur = ggml_add(ctx0, attn[0], attn[1]);
cb(cur, "combine_attn", il);
cur->op_params[0] = 0xff;
for (int id = 2; id < (int)attn.size(); ++id) {
cur = ggml_add(ctx0, cur, attn[id]);
cb(cur, "combine_attn", il);
}
if (attn.size() > 2) {
cur->op_params[0] = 0xff;
GGML_ASSERT(id_last >= 0);
if (add_input) {
attn[id_last] = ggml_add(ctx0, attn[id_last], input);
cb(attn[id_last], "attn_out_with_input", il);
}
auto cur = ggml_reduce(ctx0, attn.data(), wq->n_device, GGML_OP_ADD);
ggml_build_forward_expand(gf, cur);
cb(cur, "attn_combined", il);
return cur;
}
}
auto cur = input;
if (the_attn_norm) {
cur = llm_build_norm(ctx0, cur, hparams, the_attn_norm, NULL, LLM_NORM_RMS, cb, il);
cur = llm_build_norm(ctx0, cur, hparams, the_attn_norm, NULL, is_norm ? LLM_NORM : LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
}
@@ -9549,5 +9562,10 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, KQ_scale, cb, il, sinks, n_swa);
if (add_input) {
cb(cur, "attn_out", il);
cur = ggml_add(ctx0, cur, input);
}
return cur;
}

View File

@@ -335,7 +335,8 @@ struct llm_build_context {
ggml_tensor * act_scales,
llm_ffn_op_type type_op,
llm_ffn_gate_type type_gate,
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr);
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false,
bool is_norm = false, ggml_tensor * add_extra = nullptr);
static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx,
ggml_tensor * cur,
@@ -351,7 +352,7 @@ struct llm_build_context {
bool scale_w,
float w_scale,
llm_expert_gating_func_type gating_op,
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr);
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false);
static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx,
ggml_tensor * cur,
@@ -367,7 +368,7 @@ llm_expert_gating_func_type gating_op,
bool scale_w,
float w_scale,
llm_expert_gating_func_type gating_op,
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr) {
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false) {
return llm_build_moe_ffn(ctx, lctx, cur,
gate_inp, nullptr,
up_exps, nullptr,
@@ -376,7 +377,7 @@ llm_expert_gating_func_type gating_op,
exp_probs_b,
n_expert, n_expert_used,
type_op, norm_w, scale_w, w_scale,
gating_op, cb, il, graph);
gating_op, cb, il, graph, add_input);
}
static ggml_tensor * llm_build_std_moe_ffn(ggml_context * ctx, llama_context & lctx,
@@ -398,7 +399,7 @@ llm_expert_gating_func_type gating_op,
float w_scale,
llm_expert_gating_func_type gating_op,
llm_ffn_op_type type_op_shexp,
const llm_build_cb & cb, int il, ggml_cgraph * graph);
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input = false);
static ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids);
@@ -410,6 +411,6 @@ llm_expert_gating_func_type gating_op,
ggml_tensor * build_std_attention(ggml_cgraph * gf, ggml_tensor * attn_norm, ggml_tensor * cur, ggml_tensor * inp_pos, ggml_tensor * rope_factors,
ggml_tensor * KQ_mask, ggml_tensor * sinks, ggml_tensor * inp_attn_scale, float KQ_scale, float f_attn_scale,
int n_swa, int il, bool do_rope = true, bool add_graph_split = false);
int n_swa, int il, bool do_rope = true, bool add_graph_split = false, bool add_input = false, bool is_norm = false);
};