Split mode "graph" for Cohere2 (#1061)

* This works and TG is decent, but PP is low

* Better

* Apply f_logit_scale before mul mat with output tensor

* This is better for PP: 600 t/s -> 700 t/s

* To not lose this again

* WIP

* Equal split

* WIP

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Author: Kawrakow
Date: 2025-12-13 20:30:08 +01:00
Committed by: GitHub
Parent: 5645be6cfc
Commit: f90d1fdd06
10 changed files with 211 additions and 107 deletions
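
The f_logit_scale bullet is easiest to see as algebra: the scale commutes with the output matmul, so it can be applied to the small hidden-state tensor instead of the full logits (per the notes above, worth 600 t/s -> 700 t/s PP). A minimal ggml-style sketch, with illustrative names (model.output, hparams.f_logit_scale) that are not necessarily the ones in this commit:

// Cohere2 scales the final logits by f_logit_scale. Since
//   f_logit_scale * mul_mat(W_out, h) == mul_mat(W_out, f_logit_scale * h),
// the scale can move before the matmul, touching an [n_embd, n_tokens]
// tensor instead of the much larger [n_vocab, n_tokens] logits.

// before: scale after the output projection
cur = ggml_mul_mat(ctx, model.output, cur);           // [n_vocab, n_tokens]
cur = ggml_scale(ctx, cur, hparams.f_logit_scale);

// after: scale the hidden state first
cur = ggml_scale(ctx, cur, hparams.f_logit_scale);    // [n_embd, n_tokens]
cur = ggml_mul_mat(ctx, model.output, cur);           // [n_vocab, n_tokens]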


@@ -1639,7 +1639,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// check if we should start a new split based on the sources of the current node
bool need_new_split = false;
if (node->op == GGML_OP_ADD && node->op_params[0] == 0xff) {
if ((node->op == GGML_OP_ADD && node->op_params[0] == 0xff) ||
node->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] == 0xff) {
need_new_split = true;
}
else if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
@@ -1882,6 +1883,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_backend_sched_split * split, std::array<bool, GGML_SCHED_MAX_BACKENDS> & needs_sync,
std::vector<int32_t> & ids, std::vector<uint32_t> & unique_ids, ggml_tensor * last_ids_tensor) {
if (split->n_inputs < 1) return;
constexpr bool k_set_sync = false;
int split_backend_id = split->backend_id;
ggml_backend_t split_backend = sched->backends[split_backend_id];
ggml_backend_t last_input_backend = nullptr;
@@ -1906,7 +1908,7 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back
} else {
ggml_backend_synchronize(split_backend);
}
needs_sync[split_backend_id] = false;
needs_sync[split_backend_id] = k_set_sync;
}
ggml_tensor * node = split->graph.nodes[0];
@@ -1941,7 +1943,7 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back
ggml_backend_tensor_get_async(ids_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
ggml_backend_synchronize(ids_backend);
needs_sync[tensor_backend_id(ids_tensor)] = false;
needs_sync[tensor_backend_id(ids_tensor)] = k_set_sync;
unique_ids.resize((n_expert + 31)/32);
std::memset(unique_ids.data(), 0, unique_ids.size()*sizeof(uint32_t));
@@ -2001,7 +2003,7 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back
int input_backend_id = tensor_backend_id(input);
if (needs_sync[input_backend_id]) {
ggml_backend_synchronize(input_backend);
needs_sync[input_backend_id] = false;
needs_sync[input_backend_id] = k_set_sync;
}
if (needs_sync[split_backend_id]) {
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
@@ -2009,7 +2011,7 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back
} else {
ggml_backend_synchronize(split_backend);
}
needs_sync[split_backend_id] = false;
needs_sync[split_backend_id] = k_set_sync;
}
ggml_backend_tensor_copy(input, input_cpy);
}
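
For orientation, a minimal sketch of the per-backend sync-flag pattern these hunks touch (assumed helper name and simplified signature, not the actual scheduler code): the flag means "work may still be in flight on this backend"; before reading a split's inputs the scheduler waits, on an event if one exists, else on the whole backend, then clears the flag so later copies in the same pass skip the wait. The new k_set_sync constant names the cleared value, presumably so the tuning is not lost again (per the commit message).

#include <array>

// Illustrative helper (assumed name); GGML_SCHED_MAX_BACKENDS, ggml_backend_t
// and ggml_backend_event_t are the real ggml-backend types.
constexpr bool k_set_sync = false; // flip to true to restore always-sync behavior

static void sync_backend_once(std::array<bool, GGML_SCHED_MAX_BACKENDS> & needs_sync,
                              int backend_id, ggml_backend_t backend,
                              ggml_backend_event_t event) {
    if (!needs_sync[backend_id]) {
        return;                                // nothing in flight since the last wait
    }
    if (event != nullptr) {
        ggml_backend_event_synchronize(event); // wait only for the recorded event
    } else {
        ggml_backend_synchronize(backend);     // fallback: drain the whole backend
    }
    needs_sync[backend_id] = k_set_sync;       // later copies skip the wait
}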


@@ -3411,6 +3411,9 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
}
if (backend_src != backend_dst) {
ggml_cuda_pool_alloc<half> tmp_src(cuda_ctx_src->pool());
ggml_cuda_pool_alloc<half> tmp_dst(cuda_ctx_dst->pool());
bool needs_f16_f32_copy = false;
// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
@@ -3418,39 +3421,34 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
#ifdef GGML_CUDA_NO_PEER_COPY
return false;
#else
if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && dst->ne[1] >= 32) {
//
// The goal here is to reduce traffic between GPU's, which is entirely non-negligible
// for prompt processing.
// We cast the tensor to be copied to f16, copy the f16 data peer-to-peer
// and then cast back to f32 on the destination side.
// The cost for converting to/from f16 is much ower than the cost of copying
// The cost for converting to/from f16 is much lower than the cost of copying
// two times more data over PCI-E (well, at least the 30 GB/s PCI-E I have).
// iBut for some reason the following is not working.
// But for some reason the following is slower.
// Can somebody tell me why?
//
ggml_cuda_pool_alloc<half> tmp_src(cuda_ctx_src->pool(), ggml_nelements(src));
ggml_cuda_pool_alloc<half> tmp_dst(cuda_ctx_dst->pool(), ggml_nelements(dst));
ggml_cuda_set_device(cuda_ctx_dst->device);
tmp_dst.alloc(ggml_nelements(dst));
ggml_cuda_set_device(cuda_ctx_src->device);
tmp_src.alloc(ggml_nelements(src));
auto src_f16 = *src;
src_f16.type = GGML_TYPE_F16;
for (int i = 0; i < 4; ++i) src_f16.nb[i] /= 2;
src_f16.data = tmp_src.get();
auto dst_f16 = *dst;
dst_f16.type = GGML_TYPE_F16;
for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2;
dst_f16.data = tmp_dst.get();
ggml_cuda_set_device(cuda_ctx_src->device);
ggml_cuda_cpy(*cuda_ctx_src, src, &src_f16, true);
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_src->stream()));
CUDA_CHECK(cudaMemcpyPeerAsync(dst_f16.data, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&dst_f16), cuda_ctx_src->stream()));
CUDA_CHECK(cudaMemcpyPeerAsync(tmp_dst.ptr, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&src_f16), cuda_ctx_src->stream()));
ggml_cuda_set_device(cuda_ctx_dst->device);
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_dst->stream()));
ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true);
needs_f16_f32_copy = true;
} else {
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
@@ -3467,7 +3465,15 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
// wait on dst stream for the copy to complete
ggml_cuda_set_device(cuda_ctx_dst->device);
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
if (needs_f16_f32_copy) {
auto dst_f16 = *dst;
dst_f16.type = GGML_TYPE_F16;
for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2;
dst_f16.data = tmp_dst.get();
ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true);
}
} else {
// src and dst are on the same backend
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
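
The comment block above asks why the f16 round-trip is slower; the restructuring in the last hunk at least fixes its ordering, deferring the f16 -> f32 cast until the destination stream has waited on the copy event. A condensed standalone sketch of the intended scheme (hypothetical conversion kernels, no error checking; note the path is still disabled by the `false &&` guard):

#include <cuda_fp16.h>

// Hypothetical grid-stride conversion kernels (ggml has its own).
__global__ void cast_f32_to_f16(const float * x, half * y, int64_t n) {
    for (int64_t i = blockIdx.x*(int64_t)blockDim.x + threadIdx.x; i < n;
         i += (int64_t)gridDim.x*blockDim.x) {
        y[i] = __float2half(x[i]);
    }
}
__global__ void cast_f16_to_f32(const half * x, float * y, int64_t n) {
    for (int64_t i = blockIdx.x*(int64_t)blockDim.x + threadIdx.x; i < n;
         i += (int64_t)gridDim.x*blockDim.x) {
        y[i] = __half2float(x[i]);
    }
}

// Cast on the source stream, peer-copy n*sizeof(half) bytes instead of
// n*sizeof(float), cast back on the destination stream only after it has
// waited on the copy event. tmp_src lives on src_device, tmp_dst on dst_device.
void peer_copy_f32_via_f16(const float * src_f32, half * tmp_src, int src_device, cudaStream_t src_stream,
                           float * dst_f32, half * tmp_dst, int dst_device, cudaStream_t dst_stream,
                           cudaEvent_t copy_event, int64_t n) {
    const int block = 256, grid = 1024;
    cudaSetDevice(src_device);
    cast_f32_to_f16<<<grid, block, 0, src_stream>>>(src_f32, tmp_src, n);
    cudaMemcpyPeerAsync(tmp_dst, dst_device, tmp_src, src_device, n*sizeof(half), src_stream);
    cudaEventRecord(copy_event, src_stream);
    cudaSetDevice(dst_device);
    cudaStreamWaitEvent(dst_stream, copy_event, 0);
    cast_f16_to_f32<<<grid, block, 0, dst_stream>>>(tmp_dst, dst_f32, n);
}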


@@ -321,6 +321,15 @@ static __global__ void k_fast_add(int64_t ne0, int64_t nelem, const float * x, c
z[i] = x[i] + y[i % ne0];
}
template <typename src1_t, typename src2_t, typename dst_t>
static __global__ void k_fast_add_2(int64_t ne0, int64_t nelem, const src1_t * x, const src2_t * y, dst_t * z) {
int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= nelem) {
return;
}
z[i] = (dst_t)((float)x[i] + (float)y[i]);
}
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
if (ggml_nrows(dst->src[1]) == 1 && dst->src[0]->ne[0] == dst->src[1]->ne[0] &&
dst->type == GGML_TYPE_F32 && dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_F32 &&
@@ -332,6 +341,45 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
(const float *)dst->src[0]->data, (const float *)dst->src[1]->data, (float *)dst->data);
return;
}
if (ggml_is_contiguous(dst->src[0]) && ggml_are_same_shape(dst->src[0], dst->src[1]) && ggml_is_contiguous(dst)) {
constexpr int kBlockSize = 256;
auto nelem = ggml_nelements(dst);
int nblocks = (nelem + kBlockSize - 1)/kBlockSize;
if (dst->type == GGML_TYPE_F16) {
if (dst->src[0]->type == GGML_TYPE_F16 && dst->src[1]->type == GGML_TYPE_F16) {
k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
(const half *)dst->src[0]->data, (const half *)dst->src[1]->data, (half *)dst->data);
}
else if (dst->src[0]->type == GGML_TYPE_F16 && dst->src[1]->type == GGML_TYPE_F32) {
k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
(const half *)dst->src[0]->data, (const float *)dst->src[1]->data, (half *)dst->data);
}
else if (dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_F32) {
k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
(const float *)dst->src[0]->data, (const float *)dst->src[1]->data, (half *)dst->data);
} else {
k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
(const float *)dst->src[0]->data, (const half *)dst->src[1]->data, (half *)dst->data);
}
} else {
if (dst->src[0]->type == GGML_TYPE_F16 && dst->src[1]->type == GGML_TYPE_F16) {
k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
(const half *)dst->src[0]->data, (const half *)dst->src[1]->data, (float *)dst->data);
}
else if (dst->src[0]->type == GGML_TYPE_F16 && dst->src[1]->type == GGML_TYPE_F32) {
k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
(const half *)dst->src[0]->data, (const float *)dst->src[1]->data, (float *)dst->data);
}
else if (dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_F32) {
k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
(const float *)dst->src[0]->data, (const float *)dst->src[1]->data, (float *)dst->data);
} else {
k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
(const float *)dst->src[0]->data, (const half *)dst->src[1]->data, (float *)dst->data);
}
}
return;
}
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
}
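
The dispatch above enumerates the four source-type combinations for each destination type; the kernel always adds in float, so mixed f16/f32 inputs lose nothing beyond the final store. A tiny standalone harness, hypothetical and not part of the commit, just to show the launch pattern (assumes a device with managed memory):

#include <cuda_fp16.h>
#include <cstdio>

// Same kernel as above, duplicated so the sketch is self-contained.
template <typename src1_t, typename src2_t, typename dst_t>
static __global__ void k_fast_add_2(int64_t ne0, int64_t nelem, const src1_t * x, const src2_t * y, dst_t * z) {
    int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= nelem) return;
    z[i] = (dst_t)((float)x[i] + (float)y[i]);
}

int main() {
    constexpr int kBlockSize = 256;
    const int64_t n = 1000;
    float * x; half * y; float * z;
    cudaMallocManaged(&x, n*sizeof(float));
    cudaMallocManaged(&y, n*sizeof(half));
    cudaMallocManaged(&z, n*sizeof(float));
    for (int64_t i = 0; i < n; ++i) { x[i] = (float)i; y[i] = __float2half(1.0f); }
    const int nblocks = (n + kBlockSize - 1)/kBlockSize;
    k_fast_add_2<<<nblocks, kBlockSize>>>(n, n, x, y, z);   // f32 + f16 -> f32 path
    cudaDeviceSynchronize();
    printf("z[42] = %g (expect 43)\n", z[42]);
    return 0;
}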


@@ -542,7 +542,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
char ** dest_ptrs_d = nullptr;
int graph_cpynode_index = -1;
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
if(!disable_indirection_for_this_node && ctx.cuda_graph && ctx.cuda_graph->use_cpy_indirection) {
dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
}
@@ -651,7 +651,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_type_name(src0->type), ggml_type_name(src1->type));
}
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
if(!disable_indirection_for_this_node && ctx.cuda_graph && ctx.cuda_graph->use_cpy_indirection) {
ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
}
#else


@@ -1,14 +1,14 @@
#include "norm.cuh"
template <int block_size>
static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
template <int block_size, typename T>
static __global__ void norm_f32(const T * x, float * dst, const int ncols, const float eps) {
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
float2 mean_var = make_float2(0.f, 0.f);
for (int col = tid; col < ncols; col += block_size) {
const float xi = x[row*ncols + col];
const float xi = (float)x[row*ncols + col];
mean_var.x += xi;
mean_var.y += xi * xi;
}
@@ -32,7 +32,7 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
const float inv_std = rsqrtf(var + eps);
for (int col = tid; col < ncols; col += block_size) {
dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
dst[row*ncols + col] = (T)(((float)x[row*ncols + col] - mean) * inv_std);
}
}
@@ -261,14 +261,15 @@ static __global__ void fused_rms_norm_f32_nc(
}
}
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
template <typename T>
static void norm_f32_cuda(const T * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
GGML_ASSERT(ncols % WARP_SIZE == 0);
if (ncols < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1);
norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
norm_f32<WARP_SIZE, T><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
} else {
const dim3 block_dims(1024, 1, 1);
norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
norm_f32<1024, T><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
}
}
@@ -364,7 +365,7 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0];
@@ -373,7 +374,11 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
if (src0->type == GGML_TYPE_F32) {
norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
} else {
norm_f32_cuda((const half *)src0_d, dst_d, ne00, nrows, eps, stream);
}
}
void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
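
For reference, the per-row statistics the kernel computes are unchanged by the templating; only the input type T differs (the (T) cast on the store rounds the result to the input precision before it lands in the f32 destination). With n = ncols:

\mu = \frac{1}{n}\sum_i x_i, \qquad
\sigma^2 = \frac{1}{n}\sum_i x_i^2 - \mu^2, \qquad
y_i = (x_i - \mu)\,\mathrm{rsqrt}(\sigma^2 + \varepsilon)

mean_var.x and mean_var.y accumulate the two sums in a single pass; keeping the accumulation in float even for f16 inputs avoids the worst of the E[x^2] - E[x]^2 cancellation.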


@@ -7232,7 +7232,12 @@ static struct ggml_tensor * ggml_norm_impl(
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
if (inplace && a->type != GGML_TYPE_F32) {
GGML_ABORT("Fatal error");
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : a->type == GGML_TYPE_F32 ? ggml_dup_tensor(ctx, a)
: ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], a->ne[1], a->ne[2], a->ne[3]);
ggml_set_op_params(result, &eps, sizeof(eps));
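
A small usage sketch of what the ggml.c change above permits (hypothetical standalone context; in a real graph these tensors come from the model): ggml_norm over an f16 tensor now allocates an f32 result instead of duplicating the f16 tensor, while inplace norm stays F32-only.

#include "ggml.h"

static void norm_f16_example(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 4096, 8);
    struct ggml_tensor * n = ggml_norm(ctx, a, 1e-5f); // n->type == GGML_TYPE_F32

    // ggml_norm_inplace(ctx, a, 1e-5f); // would hit the new GGML_ABORT: inplace needs F32
    (void)n;
    ggml_free(ctx);
}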