From aa40ee491f1e4a72610540d7e6c780bd04b49095 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Thu, 22 May 2025 18:02:05 +0300
Subject: [PATCH] Synchronize after each mul_mat in up_gate so we know which one triggers the illegal memory access

---
 ggml/src/ggml-cuda.cu | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 642cc421..f25892f9 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2748,6 +2748,7 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
             dst_row.data = dst_up_contiguous.get();
             ggml_cuda_mul_mat(ctx, &src0_1_row, &src1_row, &dst_row);
             CUDA_CHECK(cudaGetLastError());
+            CUDA_CHECK(cudaStreamSynchronize(stream));
 
             dst_row.data = dst_gate_contiguous.get();
             ggml_cuda_mul_mat(ctx, &src0_2_row, &src1_row, &dst_row);
@@ -2756,6 +2757,7 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
             ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(&dst_row),
                     (const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
                     (float *)dst_gate_contiguous.get());
             CUDA_CHECK(cudaGetLastError());
+            CUDA_CHECK(cudaStreamSynchronize(stream));
 
             if (fuse_down) {
@@ -2777,6 +2779,7 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
                 ggml_cuda_mul_mat(ctx, &final_src, &dst_row, &final_dst);
                 //ggml_cuda_mul_mat(ctx, next->src[0], &dst_row, &final_dst);
                 CUDA_CHECK(cudaGetLastError());
+                CUDA_CHECK(cudaStreamSynchronize(stream));
 
                 dim3 block_dims(std::min((unsigned int)next->ne[0], 768u));
                 dim3 grid_dims(num_src1_rows);