diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 0544a652..6ab058ac 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -3263,8 +3263,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
                 ggml_cuda_op_sum_rows_nc(ctx, cgraph->nodes[i+1]);
                 i += 2;
             } else {
-                //auto src = dst->src[0];
-                //printf("cont(%s -> %s): %ld x %ld x %ld x %ld; %zu x %zu x %zu x %zu\n", src->name, dst->name, src->ne[0], src->ne[1], src->ne[2], src->ne[3], src->nb[0], src->nb[1], src->nb[2], src->nb[3]);
                 ggml_cuda_dup(ctx, dst);
             }
             break;
diff --git a/src/llama-delta-net.cpp b/src/llama-delta-net.cpp
index 8dc52429..6826e11c 100644
--- a/src/llama-delta-net.cpp
+++ b/src/llama-delta-net.cpp
@@ -636,9 +636,8 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_
 
     ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_tok);
     ggml_tensor * attn_out_norm = llm_build_context::llm_build_norm(ctx0, attn_out_2d, hparams, model.layers[il].ssm_norm, nullptr, LLM_NORM_RMS, cb, il);
-    ggml_tensor * gated_silu = ggml_silu(ctx0, z_2d);
-    cb(gated_silu, "gated_silu", il);
-    attn_out_norm = ggml_mul(ctx0, attn_out_norm, gated_silu);
+    cb(attn_out_norm, "attn_rms_norm", il);
+    attn_out_norm = ggml_fused_mul_unary(ctx0, z_2d, attn_out_norm, GGML_UNARY_OP_SILU);
     cb(attn_out_norm, "attn_out_norm", il);
     ggml_tensor * final_output = ggml_reshape_2d(ctx0, attn_out_norm, value_dim, n_tok);
 
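
For reference, the llama-delta-net.cpp hunk swaps the separate SiLU + elementwise multiply for a single fused graph node, and moves the callback so the pre-gate RMS-norm output stays observable as "attn_rms_norm". A minimal sketch of the intended equivalence, assuming ggml_fused_mul_unary(ctx, a, b, op) computes op(a) * b in one kernel (a fork-side helper, not upstream ggml API; the gated_norm wrapper below is hypothetical):

#include "ggml.h"

// Builds silu(z) * x either as two graph nodes or as one fused node.
// Assumption: ggml_fused_mul_unary(ctx, a, b, op) is equivalent to
// ggml_mul(ctx, ggml_unary(ctx, a, op), b).
static struct ggml_tensor * gated_norm(struct ggml_context * ctx,
                                       struct ggml_tensor * z,  // gate input
                                       struct ggml_tensor * x,  // RMS-normed attention output
                                       bool fused) {
    if (!fused) {
        struct ggml_tensor * g = ggml_silu(ctx, z); // materializes an intermediate tensor
        return ggml_mul(ctx, x, g);                 // elementwise x * silu(z)
    }
    // single node: skips the silu(z) intermediate and one kernel launch
    return ggml_fused_mul_unary(ctx, z, x, GGML_UNARY_OP_SILU); // fork-specific op
}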