diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index da11a427..c2cbb777 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2660,7 +2660,7 @@ static bool ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_te
             ggml_cuda_op_fused_mul_mat_vec_q_id(ctx, src0_1, &local_src1, ids, the_destination,
                     dst->src[4], dst->src[5],
                     (const char *)src0_1->data, (const char *)src0_2->data, (const float *)src1->data, src1_quantized.get(),
-                    (float *)dst_gate_contiguous.get(),
+                    (float *)local_dst.data,
                     0, src0_1->ne[1], 1, src1_padded_col_size, unary_op, stream);
             CUDA_CHECK(cudaGetLastError());
 
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index bd6758a9..a114e80a 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -738,7 +738,14 @@ void ggml_cuda_op_fused_mul_mat_vec_q_id(ggml_backend_cuda_context & ctx,
     GGML_ASSERT(ne10 % QK8_1 == 0);
     GGML_ASSERT(src0->ne[3] == 1 && src1->ne[3] == 1 && dst->ne[3] == 1);
     GGML_ASSERT(src1->ne[1] == 1 && src1->ne[2] == 1);
-    GGML_ASSERT(!ids || ids->ne[0] == dst->ne[2]);
+    if (ids && ids->ne[0] != dst->ne[2]) {
+        printf("%s(%s->%s): unexpected situation\n", __func__, src0->name, dst->name);
+        printf("  src0 = %ld x %ld x %ld x %ld\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
+        printf("  src1 = %ld x %ld x %ld x %ld\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
+        printf("   ids = %ld x %ld x %ld x %ld\n", ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3]);
+        printf("   dst = %ld x %ld x %ld x %ld\n", dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
+        GGML_ABORT("Fatal error");
+    }
 
     const int64_t ne0 = dst->ne[0];