Offload only active experts by default; turn the CLI switch into an opt-out.

common:
  * gpt_params::only_active_exps now defaults to true.
  * The switch is renamed to --no-offload-only-active-experts / -no-ooae and
    clears only_active_exps. Leaving the old `= true` assignment behind the
    renamed flag would make it a no-op, because true is now the default.

llama-bench:
  * New parameter -no-ooae / --no-offload-only-active-experts <0|1> (default 0),
    forwarded as cparams.only_active_experts = !no_ooae.
  * The value is reported as test field "ooae" (markdown column "no-ooae") and
    is only shown in the markdown table when it differs from the default.

NOTE(review): line breaks and leading indentation in this patch were
reconstructed; if context does not match the tree, apply with
`git apply --ignore-whitespace` or regenerate the patch. The post-image blob
hash on the common.cpp `index` line was not recomputed after the fix.

diff --git a/common/common.cpp b/common/common.cpp
index d0f38ed3..b9780ef7 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1390,7 +1390,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "--offload-only-active-experts" || arg == "-ooae") {
-        params.only_active_exps = true;
+    if (arg == "--no-offload-only-active-experts" || arg == "-no-ooae") {
+        params.only_active_exps = false;
         return true;
     }
diff --git a/common/common.h b/common/common.h
index ddd50755..17628b98 100644
--- a/common/common.h
+++ b/common/common.h
@@ -255,7 +255,7 @@ struct gpt_params {
     bool repack_tensors = false; // repack tensors if interleaved variant is available
     bool use_thp = false; // use transparent huge pages (linux only)
     bool validate_quants = false; // if true, check for NaNs while loading the model
-    bool only_active_exps = false; // if true, offload only active experts (relevant only for hybrid CPU/GPU)
+    bool only_active_exps = true; // if true, offload only active experts (relevant only for hybrid CPU/GPU)
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index c3015134..9a6a27f1 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -264,6 +264,7 @@ struct cmd_params {
     bool ger = false; // ger = Grouped Expert Routing
     bool no_fug = false;
     bool use_thp = false;
+    bool no_ooae = false;
     output_formats output_format;
     output_formats output_format_stderr;
 };
@@ -301,6 +302,7 @@ static const cmd_params cmd_params_defaults = {
     /* ger */ false,
     /* no_fug */ false,
     /* use_thp */ false,
+    /* no_ooae */ false,
     /* output_format */ MARKDOWN,
     /* output_format_stderr */ NONE,
 };
@@ -345,6 +347,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -fmoe, --fused-moe <0|1> (default: %s)\n", cmd_params_defaults.fmoe? "1" : "0");
     printf(" -ger, --grouped-expert-routing <0|1>(default: %s)\n", cmd_params_defaults.ger ? "1" : "0");
     printf(" -no-fug, --no-fused-up-gate <0|1> (default: %s)\n", cmd_params_defaults.no_fug? "1" : "0");
+    printf(" -no-ooae, --no-offload-only-active-experts <0|1> (default: %s)\n", cmd_params_defaults.no_ooae? "1" : "0");
     printf("\n");
     printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
 }
@@ -754,6 +757,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
             params.no_fug = std::stoi(argv[i]);
+        } else if (arg == "-no-ooae" || arg == "--no-offload-only-active-experts") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.no_ooae = std::stoi(argv[i]);
         } else if (arg == "-ot" || arg == "--override-tensor") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -841,6 +850,7 @@ struct cmd_params_instance {
     bool ger = false;
     bool no_fug = false;
     bool use_thp = false;
+    bool no_ooae = false;
     const llama_model_tensor_buft_override* buft_overrides;
 
     llama_model_params to_llama_mparams() const {
@@ -888,6 +898,7 @@ struct cmd_params_instance {
         cparams.fused_moe_up_gate = fmoe;
         cparams.grouped_expert_routing = ger;
         cparams.fused_up_gate = !no_fug;
+        cparams.only_active_experts = !no_ooae;
         cparams.min_experts = ser.first;
         cparams.thresh_experts = ser.second;
         cparams.embeddings = embeddings;
@@ -949,6 +960,7 @@ static std::vector get_cmd_params_instances(const cmd_param
                 /* .ger = */ params.ger,
                 /* .no_fug = */ params.no_fug,
                 /* .use_thp = */ params.use_thp,
+                /* .no_ooae = */ params.no_ooae,
                 /* .buft_overrides=*/ params.buft_overrides.data(),
             };
             instances.push_back(instance);
@@ -985,6 +997,7 @@ static std::vector get_cmd_params_instances(const cmd_param
                 /* .ger = */ params.ger,
                 /* .no_fug = */ params.no_fug,
                 /* .use_thp = */ params.use_thp,
+                /* .no_ooae = */ params.no_ooae,
                 /* .buft_overrides=*/ params.buft_overrides.data(),
             };
             instances.push_back(instance);
@@ -1021,6 +1034,7 @@ static std::vector get_cmd_params_instances(const cmd_param
                 /* .ger = */ params.ger,
                 /* .no_fug = */ params.no_fug,
                 /* .use_thp = */ params.use_thp,
+                /* .no_ooae = */ params.no_ooae,
                 /* .buft_overrides=*/ params.buft_overrides.data(),
             };
             instances.push_back(instance);
@@ -1057,6 +1071,7 @@ static std::vector get_cmd_params_instances(const cmd_param
                 /* .ger = */ params.ger,
                 /* .no_fug = */ params.no_fug,
                 /* .use_thp = */ params.use_thp,
+                /* .no_ooae = */ params.no_ooae,
                 /* .buft_overrides=*/ params.buft_overrides.data(),
             };
             instances.push_back(instance);
@@ -1104,6 +1119,7 @@ struct test {
     bool ger = false;
     bool no_fug = false;
     bool use_thp = false;
+    bool no_ooae = false;
     int n_prompt;
     int n_gen;
     std::string test_time;
@@ -1140,6 +1156,7 @@ struct test {
         ger = inst.ger;
         no_fug = inst.no_fug;
         use_thp = inst.use_thp;
+        no_ooae = inst.no_ooae;
         n_prompt = inst.n_prompt;
         n_gen = inst.n_gen;
         test_kind = inst.test_kind;
@@ -1230,7 +1247,7 @@ struct test {
             "n_threads", "type_k", "type_v",
             "n_gpu_layers", "split_mode",
             "main_gpu", "no_kv_offload", "flash_attn", "mla_attn", "attn_max_batch", "ser",
-            "tensor_split", "use_mmap", "embeddings", "repack", "fused_moe", "grouped_er", "fused_up_gate", "use_thp",
+            "tensor_split", "use_mmap", "embeddings", "repack", "fused_moe", "grouped_er", "fused_up_gate", "use_thp", "ooae",
             "n_prompt", "n_gen", "test_time",
             "avg_ns", "stddev_ns",
             "avg_ts", "stddev_ts", "test",
@@ -1252,7 +1269,7 @@ struct test {
         if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
             field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
             field == "flash_attn" || field == "use_mmap" || field == "embeddings" || field == "repack" || field == "use_thp" ||
-            field == "fused_moe" || field == "grouped_er" || field == "fused_up_gate") {
+            field == "fused_moe" || field == "grouped_er" || field == "fused_up_gate" || field == "ooae") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -1296,7 +1313,7 @@ struct test {
             std::to_string(mla_attn), std::to_string(attn_max_batch), ser_to_string(ser),
             tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
             std::to_string(repack), std::to_string(fmoe), std::to_string(ger),
-            std::to_string(no_fug), std::to_string(use_thp),
+            std::to_string(no_fug), std::to_string(use_thp), std::to_string(no_ooae),
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
             std::to_string(avg_ts()), std::to_string(stdev_ts()),
@@ -1486,6 +1503,9 @@ struct markdown_printer : public printer {
         if (field == "fused_up_gate") {
             return 6;
         }
+        if (field == "ooae") {
+            return 7;
+        }
         if (field == "test") {
             return 13;
         }
@@ -1544,6 +1564,9 @@ struct markdown_printer : public printer {
         if (field == "fused_up_gate") {
             return "no-fug";
         }
+        if (field == "ooae") {
+            return "no-ooae";
+        }
         if (field == "embeddings") {
             return "embd";
         }
@@ -1623,6 +1646,9 @@ struct markdown_printer : public printer {
         if (params.no_fug != cmd_params_defaults.no_fug) {
             fields.emplace_back("fused_up_gate");
         }
+        if (params.no_ooae != cmd_params_defaults.no_ooae) {
+            fields.emplace_back("ooae");
+        }
         fields.emplace_back("test");
         fields.emplace_back("t/s");
 