Fused mul + multi_add op (#858)

* Add fused mul+multi_add op with CPU implementation

* fused mul+multi_add: CUDA implementation

* fused mul+multi_add: add a command-line argument to disable it

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-10-24 07:40:35 +03:00
committed by GitHub
parent 483cea527d
commit db3ba4999f
15 changed files with 211 additions and 38 deletions

View File

@@ -235,6 +235,7 @@ struct gpt_params {
int attn_max_batch = 0; // Max batch size to use when computing attention (only applicable if flash_attn = false)
bool fused_moe_up_gate = false; // fused up*unary(gate) op for MoE models
bool fused_up_gate = true; // fused up*unary(gate) op
bool fused_mmad = true; // fused mul+multi_add op
bool grouped_expert_routing = false; // if to use grouped expert routing (BailingMoeV2 arch)
int min_experts = -1;
float thresh_experts = 0;