mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-02 10:00:07 +00:00
cuda: Remove unnecessary device to host copy of row ids
We get 3-4% TG speed improvement for DeepSeek-Lite just from that.
This commit is contained in:
@@ -2505,11 +2505,6 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
dst_padded_col_size, next->src[0]->type, stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
std::vector<char> ids_host(ggml_nbytes(ids));
|
||||
const char * ids_dev = (const char *) ids->data;
|
||||
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
|
||||
CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
local_dst.ne[2] = 1;
|
||||
|
||||
auto local_next = *next;
|
||||
|
||||
@@ -148,6 +148,7 @@ static __global__ void mul_mat_vec_q(
|
||||
const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0) {
|
||||
int i2 = blockIdx.y;
|
||||
int i02 = ids_data ? *(const int *)(ids_data + i2*ids_nb0) : i2;
|
||||
if (i02 < 0) return;
|
||||
const char * cx = (const char *)vx + i02*nb02;
|
||||
const char * cy = (const char *)vy + i2*nb12;
|
||||
char * cdst = (char *)dst + i2*nb2;
|
||||
|
||||
Reference in New Issue
Block a user