diff --git a/common/common.cpp b/common/common.cpp index 3192fd37..3a4d9a72 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -21,6 +21,7 @@ #include #include #include +#include <cstdlib> #include #include #include @@ -463,6 +464,19 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { return true; } +static void gpt_params_apply_runtime_env(const gpt_params & params) { + if (params.qwen3next_fused_delta < 0) { + return; + } + + const std::string value = std::to_string(params.qwen3next_fused_delta); +#if defined(_WIN32) + _putenv_s("LLAMA_QWEN3NEXT_FUSED_DELTA", value.c_str()); +#else + setenv("LLAMA_QWEN3NEXT_FUSED_DELTA", value.c_str(), 1); +#endif +} + void gpt_params_parse_from_env(gpt_params & params) { // we only care about server-related params for now get_env("LLAMA_ARG_MODEL", params.model); @@ -483,10 +497,13 @@ void gpt_params_parse_from_env(gpt_params & params) { get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots); get_env("LLAMA_ARG_EMBEDDINGS", params.embedding); get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn); + get_env("LLAMA_ARG_QWEN3NEXT_FUSED_DELTA", params.qwen3next_fused_delta); get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold); get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching); get_env("LLAMA_ARG_HOST", params.hostname); get_env("LLAMA_ARG_PORT", params.port); + + gpt_params_apply_runtime_env(params); } bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { @@ -504,6 +521,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return false; } + gpt_params_apply_runtime_env(params); return true; } @@ -1229,6 +1247,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } + if (arg == "--qwen3next-fused-delta") { + CHECK_ARG + params.qwen3next_fused_delta = std::stoi(argv[i]); + if (params.qwen3next_fused_delta < 0 || params.qwen3next_fused_delta > 2) { + fprintf(stderr, "error: Invalid value for --qwen3next-fused-delta: %d 
(must be 0, 1, or 2)\n", + params.qwen3next_fused_delta); + invalid_param = true; + } + return true; + } if (arg == "-mla" || arg == "--mla-use") { CHECK_ARG params.mla_attn = std::stoi(argv[i]); @@ -2153,6 +2181,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks }); options.push_back({ "*", "-no-fa, --no-flash-attn", "disable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" }); options.push_back({ "*", "-fa, --flash-attn (auto|on|off|0|1)", "set Flash Attention (default: %s)", params.flash_attn ? "on" : "off" }); + options.push_back({ "*", " --qwen3next-fused-delta {0,1,2}", + "force LLAMA_QWEN3NEXT_FUSED_DELTA mode for Qwen3Next (default: env/model default)" }); options.push_back({ "*", "-mla, --mla-use", "enable MLA (default: %d)", params.mla_attn }); options.push_back({ "*", "-amb, --attention-max-batch", "max batch size for attention computations (default: %d)", params.attn_max_batch}); options.push_back({ "*", "-no-fmoe, --no-fused-moe", "disable fused MoE (default: %s)", params.fused_moe_up_gate ? "enabled" : "disabled" }); @@ -4200,6 +4230,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false"); + fprintf(stream, "qwen3next_fused_delta: %d # default: -1 (keep env/model default)\n", params.qwen3next_fused_delta); fprintf(stream, "mla_attn: %d # default: 0\n", params.mla_attn); fprintf(stream, "attn_max_batch: %d # default: 0\n", params.attn_max_batch); fprintf(stream, "fused_moe: %s # default: false\n", params.fused_moe_up_gate ? 
"true" : "false"); diff --git a/common/common.h b/common/common.h index 1de82a6c..827fc1bb 100644 --- a/common/common.h +++ b/common/common.h @@ -259,6 +259,7 @@ struct gpt_params { bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = true; // flash attention + int qwen3next_fused_delta = -1; // -1 keep env/default, otherwise force LLAMA_QWEN3NEXT_FUSED_DELTA={0,1,2} int mla_attn = 3; // MLA 0: standard, 1: MLA with K and V^T cache, 2: MLA with just K cache, 3: the best of both worlds int attn_max_batch = 0; // Max batch size to use when computing attention (only applicable if flash_attn = false) bool fused_moe_up_gate = true; // fused up*unary(gate) op for MoE models diff --git a/docs/development/qwen3next_perf_diff_report.md b/docs/development/qwen3next_perf_diff_report.md index cc2a9765..5885744e 100644 --- a/docs/development/qwen3next_perf_diff_report.md +++ b/docs/development/qwen3next_perf_diff_report.md @@ -36,10 +36,9 @@ Not directly mirrored yet (by design divergence from mainline model layout): ## Required Adjustments (remaining) 1. Keep non-fused as the strict safety baseline in defaults, and use `LLAMA_QWEN3NEXT_FUSED_DELTA=1` (prefill-only fused) as the explicit acceleration mode. -2. Add a first-class runtime flag/CLI plumb for Qwen3Next fused mode (`LLAMA_QWEN3NEXT_FUSED_DELTA`) so serving does not depend on raw env wiring. -3. Continue using `scripts/qwen3next-regression.sh` as the release gate for this model path, and wire it into CI or pre-merge checks. -4. Treat the remaining PR #19375 autoregressive rewrite as deferred: direct porting into current ik graph builder is not layout-compatible without broader contiguity/reshape refactoring. -5. Revisit PR #18792 (`src/models/delta.cpp`) only if we need unified GDA/KDA support for additional architectures; for Qwen3Next-only it is optional. +2. 
Continue using `scripts/qwen3next-regression.sh` as the release gate for this model path, and wire it into CI or pre-merge checks. +3. Treat the remaining PR #19375 autoregressive rewrite as deferred: direct porting into current ik graph builder is not layout-compatible without broader contiguity/reshape refactoring. +4. Revisit PR #18792 (`src/models/delta.cpp`) only if we need unified GDA/KDA support for additional architectures; for Qwen3Next-only it is optional. ## Strong Points of `ik_llama.cpp` to Preserve @@ -110,3 +109,6 @@ Relative (`ik` vs mainline): - Added unified Qwen3Next regression entrypoint for ongoing checks: - `scripts/qwen3next-regression.sh --model /path/to/qwen3-next-coder.gguf` - Outputs `SUMMARY.md` + per-step logs under `/tmp/qwen3next-regression//`. +- Added CLI plumbing for fused mode control (no raw env required): + - `--qwen3next-fused-delta {0|1|2}` + - This sets `LLAMA_QWEN3NEXT_FUSED_DELTA` for the current process.