cuda: Remove unnecessary device-to-host copy of row ids

We get a 3-4% TG (token generation) speed improvement for DeepSeek-Lite just from that.
2026-03-02 10:00:07 +00:00 · 2025-05-10 09:49:08 +03:00
parent e5a4a3ce78
commit 10557832b1
2 changed files with 1 additions and 5 deletions
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2505,11 +2505,6 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
                        dst_padded_col_size, next->src[0]->type, stream);
                CUDA_CHECK(cudaGetLastError());

-                std::vector<char> ids_host(ggml_nbytes(ids));
-                const char * ids_dev = (const char *) ids->data;
-                CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
-                CUDA_CHECK(cudaStreamSynchronize(stream));
-
                local_dst.ne[2] = 1;

                auto local_next = *next;
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -148,6 +148,7 @@ static __global__ void mul_mat_vec_q(
    const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0) {
    int i2 = blockIdx.y;
    int i02 = ids_data ? *(const int *)(ids_data + i2*ids_nb0) : i2;
+    if (i02 < 0) return;
    const char * cx = (const char *)vx + i02*nb02;
    const char * cy = (const char *)vy + i2*nb12;
    char * cdst = (char *)dst + i2*nb2;