diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index ff6e064c..87f80d0c 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -2505,11 +2505,6 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor dst_padded_col_size, next->src[0]->type, stream); CUDA_CHECK(cudaGetLastError()); - std::vector ids_host(ggml_nbytes(ids)); - const char * ids_dev = (const char *) ids->data; - CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - local_dst.ne[2] = 1; auto local_next = *next; diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index f87ebb96..91fcf87c 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -148,6 +148,7 @@ static __global__ void mul_mat_vec_q( const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0) { int i2 = blockIdx.y; int i02 = ids_data ? *(const int *)(ids_data + i2*ids_nb0) : i2; + if (i02 < 0) return; const char * cx = (const char *)vx + i02*nb02; const char * cy = (const char *)vy + i2*nb12; char * cdst = (char *)dst + i2*nb2;