cuda: Remove unnecessary device to host copy of row ids

We get a 3-4% TG (token generation) speed improvement for DeepSeek-Lite just from that.
This commit is contained in:
Iwan Kawrakow
2025-05-10 09:49:08 +03:00
parent e5a4a3ce78
commit 10557832b1
2 changed files with 1 addition and 5 deletions

View File

@@ -2505,11 +2505,6 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
dst_padded_col_size, next->src[0]->type, stream);
CUDA_CHECK(cudaGetLastError());
std::vector<char> ids_host(ggml_nbytes(ids));
const char * ids_dev = (const char *) ids->data;
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
local_dst.ne[2] = 1;
auto local_next = *next;

View File

@@ -148,6 +148,7 @@ static __global__ void mul_mat_vec_q(
const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0) {
int i2 = blockIdx.y;
int i02 = ids_data ? *(const int *)(ids_data + i2*ids_nb0) : i2;
if (i02 < 0) return;
const char * cx = (const char *)vx + i02*nb02;
const char * cy = (const char *)vy + i2*nb12;
char * cdst = (char *)dst + i2*nb2;