diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index ff6e064c..87f80d0c 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2505,11 +2505,6 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
                         dst_padded_col_size, next->src[0]->type, stream);
                 CUDA_CHECK(cudaGetLastError());
 
-                std::vector<char> ids_host(ggml_nbytes(ids));
-                const char * ids_dev = (const char *) ids->data;
-                CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
-                CUDA_CHECK(cudaStreamSynchronize(stream));
-
                 local_dst.ne[2] = 1;
 
                 auto local_next = *next;
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index f87ebb96..91fcf87c 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -148,6 +148,7 @@ static __global__ void mul_mat_vec_q(
     const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0) {
     int i2 = blockIdx.y;
     int i02 = ids_data ? *(const int *)(ids_data + i2*ids_nb0) : i2;
+    if (i02 < 0) return;
     const char * cx = (const char *)vx + i02*nb02;
     const char * cy = (const char *)vy + i2*nb12;
     char * cdst = (char *)dst + i2*nb2;