mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-28 02:11:50 +00:00
Split mode "graph" for Cohere2 (#1061)
* This works and TG is decent, but PP is low * Better * Apply f_logit_scale before mul mat with output tensor * This is better for PP: 600 t/s -> 700 t/s * To not lose this again * WIP * Equal split * WIP --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -1639,7 +1639,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
|
||||
// check if we should start a new split based on the sources of the current node
|
||||
bool need_new_split = false;
|
||||
if (node->op == GGML_OP_ADD && node->op_params[0] == 0xff) {
|
||||
if ((node->op == GGML_OP_ADD && node->op_params[0] == 0xff) ||
|
||||
node->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] == 0xff) {
|
||||
need_new_split = true;
|
||||
}
|
||||
else if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
|
||||
@@ -1882,6 +1883,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
||||
static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_backend_sched_split * split, std::array<bool, GGML_SCHED_MAX_BACKENDS> & needs_sync,
|
||||
std::vector<int32_t> & ids, std::vector<uint32_t> & unique_ids, ggml_tensor * last_ids_tensor) {
|
||||
if (split->n_inputs < 1) return;
|
||||
constexpr bool k_set_sync = false;
|
||||
int split_backend_id = split->backend_id;
|
||||
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
||||
ggml_backend_t last_input_backend = nullptr;
|
||||
@@ -1906,7 +1908,7 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back
|
||||
} else {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
needs_sync[split_backend_id] = false;
|
||||
needs_sync[split_backend_id] = k_set_sync;
|
||||
}
|
||||
|
||||
ggml_tensor * node = split->graph.nodes[0];
|
||||
@@ -1941,7 +1943,7 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back
|
||||
ggml_backend_tensor_get_async(ids_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
|
||||
|
||||
ggml_backend_synchronize(ids_backend);
|
||||
needs_sync[tensor_backend_id(ids_tensor)] = false;
|
||||
needs_sync[tensor_backend_id(ids_tensor)] = k_set_sync;
|
||||
|
||||
unique_ids.resize((n_expert + 31)/32);
|
||||
std::memset(unique_ids.data(), 0, unique_ids.size()*sizeof(uint32_t));
|
||||
@@ -2001,7 +2003,7 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back
|
||||
int input_backend_id = tensor_backend_id(input);
|
||||
if (needs_sync[input_backend_id]) {
|
||||
ggml_backend_synchronize(input_backend);
|
||||
needs_sync[input_backend_id] = false;
|
||||
needs_sync[input_backend_id] = k_set_sync;
|
||||
}
|
||||
if (needs_sync[split_backend_id]) {
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
@@ -2009,7 +2011,7 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back
|
||||
} else {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
needs_sync[split_backend_id] = false;
|
||||
needs_sync[split_backend_id] = k_set_sync;
|
||||
}
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
}
|
||||
|
||||
@@ -3411,6 +3411,9 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
||||
}
|
||||
|
||||
if (backend_src != backend_dst) {
|
||||
ggml_cuda_pool_alloc<half> tmp_src(cuda_ctx_src->pool());
|
||||
ggml_cuda_pool_alloc<half> tmp_dst(cuda_ctx_dst->pool());
|
||||
bool needs_f16_f32_copy = false;
|
||||
// copy on src stream
|
||||
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
|
||||
@@ -3418,39 +3421,34 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
||||
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||
return false;
|
||||
#else
|
||||
if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && dst->ne[1] >= 32) {
|
||||
//
|
||||
// The goal here is to reduce traffic between GPU's, which is entirely non-negligible
|
||||
// for prompt processing.
|
||||
// We cast the tensor to be copied to f16, copy the f16 data peer-to-peer
|
||||
// and then cast back to f32 on the destination side.
|
||||
// The cost for converting to/from f16 is much ower than the cost of copying
|
||||
// The cost for converting to/from f16 is much lower than the cost of copying
|
||||
// two times more data over PCI-E (well, at least the 30 GB/s PCI-E I have).
|
||||
// iBut for some reason the following is not working.
|
||||
// But for some reason the following is slower.
|
||||
// Can somebody tell me why?
|
||||
//
|
||||
ggml_cuda_pool_alloc<half> tmp_src(cuda_ctx_src->pool(), ggml_nelements(src));
|
||||
ggml_cuda_pool_alloc<half> tmp_dst(cuda_ctx_dst->pool(), ggml_nelements(dst));
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx_dst->device);
|
||||
tmp_dst.alloc(ggml_nelements(dst));
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx_src->device);
|
||||
tmp_src.alloc(ggml_nelements(src));
|
||||
|
||||
auto src_f16 = *src;
|
||||
src_f16.type = GGML_TYPE_F16;
|
||||
for (int i = 0; i < 4; ++i) src_f16.nb[i] /= 2;
|
||||
src_f16.data = tmp_src.get();
|
||||
|
||||
auto dst_f16 = *dst;
|
||||
dst_f16.type = GGML_TYPE_F16;
|
||||
for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2;
|
||||
dst_f16.data = tmp_dst.get();
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx_src->device);
|
||||
ggml_cuda_cpy(*cuda_ctx_src, src, &src_f16, true);
|
||||
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_src->stream()));
|
||||
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst_f16.data, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&dst_f16), cuda_ctx_src->stream()));
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(tmp_dst.ptr, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&src_f16), cuda_ctx_src->stream()));
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx_dst->device);
|
||||
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_dst->stream()));
|
||||
ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true);
|
||||
needs_f16_f32_copy = true;
|
||||
|
||||
} else {
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
|
||||
@@ -3467,7 +3465,15 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
||||
CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
|
||||
|
||||
// wait on dst stream for the copy to complete
|
||||
ggml_cuda_set_device(cuda_ctx_dst->device);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
|
||||
if (needs_f16_f32_copy) {
|
||||
auto dst_f16 = *dst;
|
||||
dst_f16.type = GGML_TYPE_F16;
|
||||
for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2;
|
||||
dst_f16.data = tmp_dst.get();
|
||||
ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true);
|
||||
}
|
||||
} else {
|
||||
// src and dst are on the same backend
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
|
||||
|
||||
@@ -321,6 +321,15 @@ static __global__ void k_fast_add(int64_t ne0, int64_t nelem, const float * x, c
|
||||
z[i] = x[i] + y[i % ne0];
|
||||
}
|
||||
|
||||
template <typename src1_t, typename src2_t, typename dst_t>
|
||||
static __global__ void k_fast_add_2(int64_t ne0, int64_t nelem, const src1_t * x, const src2_t * y, dst_t * z) {
|
||||
int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
if (i >= nelem) {
|
||||
return;
|
||||
}
|
||||
z[i] = (dst_t)((float)x[i] + (float)y[i]);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
if (ggml_nrows(dst->src[1]) == 1 && dst->src[0]->ne[0] == dst->src[1]->ne[0] &&
|
||||
dst->type == GGML_TYPE_F32 && dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_F32 &&
|
||||
@@ -332,6 +341,45 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
(const float *)dst->src[0]->data, (const float *)dst->src[1]->data, (float *)dst->data);
|
||||
return;
|
||||
}
|
||||
if (ggml_is_contiguous(dst->src[0]) && ggml_are_same_shape(dst->src[0], dst->src[1]) && ggml_is_contiguous(dst)) {
|
||||
constexpr int kBlockSize = 256;
|
||||
auto nelem = ggml_nelements(dst);
|
||||
int nblocks = (nelem + kBlockSize - 1)/kBlockSize;
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
if (dst->src[0]->type == GGML_TYPE_F16 && dst->src[1]->type == GGML_TYPE_F16) {
|
||||
k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
|
||||
(const half *)dst->src[0]->data, (const half *)dst->src[1]->data, (half *)dst->data);
|
||||
}
|
||||
else if (dst->src[0]->type == GGML_TYPE_F16 && dst->src[1]->type == GGML_TYPE_F32) {
|
||||
k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
|
||||
(const half *)dst->src[0]->data, (const float *)dst->src[1]->data, (half *)dst->data);
|
||||
}
|
||||
else if (dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_F32) {
|
||||
k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
|
||||
(const float *)dst->src[0]->data, (const float *)dst->src[1]->data, (half *)dst->data);
|
||||
} else {
|
||||
k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
|
||||
(const float *)dst->src[0]->data, (const half *)dst->src[1]->data, (half *)dst->data);
|
||||
}
|
||||
} else {
|
||||
if (dst->src[0]->type == GGML_TYPE_F16 && dst->src[1]->type == GGML_TYPE_F16) {
|
||||
k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
|
||||
(const half *)dst->src[0]->data, (const half *)dst->src[1]->data, (float *)dst->data);
|
||||
}
|
||||
else if (dst->src[0]->type == GGML_TYPE_F16 && dst->src[1]->type == GGML_TYPE_F32) {
|
||||
k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
|
||||
(const half *)dst->src[0]->data, (const float *)dst->src[1]->data, (float *)dst->data);
|
||||
}
|
||||
else if (dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_F32) {
|
||||
k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
|
||||
(const float *)dst->src[0]->data, (const float *)dst->src[1]->data, (float *)dst->data);
|
||||
} else {
|
||||
k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
|
||||
(const float *)dst->src[0]->data, (const half *)dst->src[1]->data, (float *)dst->data);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
||||
}
|
||||
|
||||
|
||||
@@ -542,7 +542,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
char ** dest_ptrs_d = nullptr;
|
||||
int graph_cpynode_index = -1;
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
|
||||
if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
|
||||
if(!disable_indirection_for_this_node && ctx.cuda_graph && ctx.cuda_graph->use_cpy_indirection) {
|
||||
dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
|
||||
graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
|
||||
}
|
||||
@@ -651,7 +651,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
}
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
|
||||
if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
|
||||
if(!disable_indirection_for_this_node && ctx.cuda_graph && ctx.cuda_graph->use_cpy_indirection) {
|
||||
ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
|
||||
}
|
||||
#else
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
#include "norm.cuh"
|
||||
|
||||
template <int block_size>
|
||||
static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
|
||||
template <int block_size, typename T>
|
||||
static __global__ void norm_f32(const T * x, float * dst, const int ncols, const float eps) {
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
float2 mean_var = make_float2(0.f, 0.f);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[row*ncols + col];
|
||||
const float xi = (float)x[row*ncols + col];
|
||||
mean_var.x += xi;
|
||||
mean_var.y += xi * xi;
|
||||
}
|
||||
@@ -32,7 +32,7 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
|
||||
const float inv_std = rsqrtf(var + eps);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
|
||||
dst[row*ncols + col] = (T)(((float)x[row*ncols + col] - mean) * inv_std);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -261,14 +261,15 @@ static __global__ void fused_rms_norm_f32_nc(
|
||||
}
|
||||
}
|
||||
|
||||
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
|
||||
template <typename T>
|
||||
static void norm_f32_cuda(const T * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
|
||||
norm_f32<WARP_SIZE, T><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
|
||||
} else {
|
||||
const dim3 block_dims(1024, 1, 1);
|
||||
norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
|
||||
norm_f32<1024, T><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -364,7 +365,7 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
@@ -373,7 +374,11 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
|
||||
} else {
|
||||
norm_f32_cuda((const half *)src0_d, dst_d, ne00, nrows, eps, stream);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
@@ -7232,7 +7232,12 @@ static struct ggml_tensor * ggml_norm_impl(
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||
if (inplace && a->type != GGML_TYPE_F32) {
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
|
||||
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : a->type == GGML_TYPE_F32 ? ggml_dup_tensor(ctx, a)
|
||||
: ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], a->ne[1], a->ne[2], a->ne[3]);
|
||||
|
||||
ggml_set_op_params(result, &eps, sizeof(eps));
|
||||
|
||||
|
||||
Reference in New Issue
Block a user