#include "spec-tuner.h" #include "ggml.h" #include "log.h" #include #include #include int spec_tuner_coord::find_nearest_arm(float value) const { int idx = 0; float best_dist = 1e30f; for (int i = 0; i < (int)arms.size(); i++) { float dist = std::fabs(arms[i].value - value); if (dist < best_dist) { best_dist = dist; idx = i; } } return idx; } int spec_tuner_coord::select_epsilon_greedy(double epsilon) const { static thread_local std::mt19937 rng(std::random_device{}()); std::uniform_real_distribution coin(0.0, 1.0); if (coin(rng) < epsilon) { std::uniform_int_distribution dist(0, (int)arms.size() - 1); return dist(rng); } return best_idx; } void spec_tuner_coord::update(double reward) { auto & arm = arms[current_idx]; arm.N += 1; arm.Q += (reward - arm.Q) / arm.N; double best_Q = -1e30; for (int i = 0; i < (int)arms.size(); i++) { if (arms[i].N > 0 && arms[i].Q > best_Q) { best_Q = arms[i].Q; best_idx = i; } } } void spec_tuner_coord::reset_scores() { for (auto & arm : arms) { arm.Q = 0.0; arm.N = 0; } current_idx = user_idx; best_idx = user_idx; } void spec_tuner_coord::build_grid_float(float lo, float hi, int n_points, float user_value) { arms.clear(); for (int i = 0; i < n_points; i++) { float v = lo + (hi - lo) * i / std::max(1, n_points - 1); arms.push_back({v, 0.0, 0}); } bool found = false; for (auto & a : arms) { if (std::fabs(a.value - user_value) < 1e-6f) { found = true; break; } } if (!found) { arms.push_back({user_value, 0.0, 0}); std::sort(arms.begin(), arms.end(), [](const spec_tuner_arm & a, const spec_tuner_arm & b) { return a.value < b.value; }); } } void spec_tuner_coord::build_grid_int(int lo, int hi, int step, int user_value) { arms.clear(); for (int v = lo; v <= hi; v += step) { arms.push_back({(float)v, 0.0, 0}); } if (arms.empty() || (int)arms.back().value != hi) { arms.push_back({(float)hi, 0.0, 0}); } bool found = false; for (auto & a : arms) { if ((int)a.value == user_value) { found = true; break; } } if (!found && user_value >= lo && user_value <= hi) { arms.push_back({(float)user_value, 0.0, 0}); std::sort(arms.begin(), arms.end(), [](const spec_tuner_arm & a, const spec_tuner_arm & b) { return a.value < b.value; }); } } void spec_tuner::reset_exploration() { n_resets++; LOG_DBG("Autotune task change detected (n_low=%d) — resetting MAB (reset #%d)\n", n_low, n_resets); for (auto & coord : coords) { coord.reset_scores(); } n_low = 0; cooldown = cooldown_max; step_ema = 0.0; n_calls = 0; } void spec_tuner::write_best(common_params_speculative & params) const { for (const auto & coord : coords) { float val = coord.arms[coord.best_idx].value; if (coord.name == "n_max") params.n_max = (int32_t)val; else if (coord.name == "p_min") params.p_min = val; else if (coord.name == "n_min") params.n_min = (int32_t)val; else if (coord.name == "ngram_size_n") params.ngram_size_n = (uint16_t)val; else if (coord.name == "ngram_size_m") params.ngram_size_m = (uint16_t)val; else if (coord.name == "ngram_min_hits") params.ngram_min_hits = (uint16_t)val; else if (coord.name == "suffix_min_match_len") params.suffix_min_match_len = (int32_t)val; } } void spec_tuner::init(common_speculative_type type, const common_params_speculative & user_params, const llama_model * model_tgt) { enabled = true; spec_type = type; coords.clear(); n_calls = 0; n_requests = 0; ema_tps = 0.0; step_ema = 0.0; n_low = 0; cooldown = 0; n_resets = 0; t_tuner_us = 0; last_n_drafted = 0; // all types get n_max // For simplicity we will create a fixed grid of possible values { spec_tuner_coord coord; coord.name = "n_max"; const bool recurrent_target = model_tgt != nullptr && llama_model_has_recurrent(model_tgt); int hi = recurrent_target ? std::max(1, (int) user_params.n_max) : std::max(16, (int) user_params.n_max); coord.build_grid_int(1, hi, 1, user_params.n_max); coords.push_back(std::move(coord)); } if (type == COMMON_SPECULATIVE_TYPE_DRAFT) { { spec_tuner_coord coord; coord.name = "p_min"; coord.build_grid_float(0.0f, 0.95f, 11, user_params.p_min); coords.push_back(std::move(coord)); } { spec_tuner_coord coord; coord.name = "n_min"; coord.build_grid_int(0, 6, 1, user_params.n_min); coords.push_back(std::move(coord)); } } if (type == COMMON_SPECULATIVE_TYPE_SUFFIX) { { spec_tuner_coord coord; coord.name = "p_min"; coord.build_grid_float(0.0f, 0.95f, 11, user_params.p_min); coords.push_back(std::move(coord)); } { spec_tuner_coord coord; coord.name = "suffix_min_match_len"; coord.build_grid_int(1, 12, 1, user_params.suffix_min_match_len); coords.push_back(std::move(coord)); } } // Ngram can change only n_max/n_min per call if (type == COMMON_SPECULATIVE_TYPE_NGRAM_MOD) { { spec_tuner_coord coord; coord.name = "n_min"; int hi = std::max(0, std::min(4, (int)user_params.n_max - 1)); coord.build_grid_int(0, hi, 1, user_params.n_min); coords.push_back(std::move(coord)); } } for (auto & coord : coords) { float user_val = 0.0f; if (coord.name == "n_max") user_val = (float)user_params.n_max; else if (coord.name == "p_min") user_val = user_params.p_min; else if (coord.name == "n_min") user_val = (float)user_params.n_min; else if (coord.name == "ngram_size_n") user_val = (float)user_params.ngram_size_n; else if (coord.name == "ngram_size_m") user_val = (float)user_params.ngram_size_m; else if (coord.name == "ngram_min_hits") user_val = (float)user_params.ngram_min_hits; else if (coord.name == "suffix_min_match_len") user_val = (float)user_params.suffix_min_match_len; coord.user_idx = coord.find_nearest_arm(user_val); coord.best_idx = 0; coord.current_idx = 0; } LOG_DBG("Autotune ε-greedy (ε=%.2f) per-draft-call, reward=per-step TPS\n", epsilon); for (const auto & coord : coords) { std::ostringstream oss; oss << " " << coord.name << ": ["; for (size_t i = 0; i < coord.arms.size(); i++) { if (i > 0) oss << ", "; oss << coord.arms[i].value; } oss << "] (user=" << coord.arms[coord.user_idx].value << ")"; LOG_DBG("%s\n", oss.str().c_str()); } } void spec_tuner::propose(common_params_speculative & params) { int64_t t_start = ggml_time_us(); // always select fresh arm for every draft call for (auto & coord : coords) { coord.current_idx = coord.select_epsilon_greedy(epsilon); float val = coord.arms[coord.current_idx].value; if (coord.name == "n_max") params.n_max = (int32_t)val; else if (coord.name == "p_min") params.p_min = val; else if (coord.name == "n_min") params.n_min = (int32_t)val; else if (coord.name == "ngram_size_n") params.ngram_size_n = (uint16_t)val; else if (coord.name == "ngram_size_m") params.ngram_size_m = (uint16_t)val; else if (coord.name == "ngram_min_hits") params.ngram_min_hits = (uint16_t)val; else if (coord.name == "suffix_min_match_len") params.suffix_min_match_len = (int32_t)val; } enforce_constraints(params); t_tuner_us += (ggml_time_us() - t_start); } void spec_tuner::enforce_constraints(common_params_speculative & params) { if (params.n_min < 0) params.n_min = 0; if (params.n_max < 1) params.n_max = 1; if (params.n_min > params.n_max) params.n_min = params.n_max; if (params.p_min < 0.0f) params.p_min = 0.0f; if (params.p_min > 0.95f) params.p_min = 0.95f; if (params.ngram_size_n < 1) params.ngram_size_n = 1; if (params.ngram_size_m < 1) params.ngram_size_m = 1; if (params.ngram_min_hits < 1) params.ngram_min_hits = 1; } void spec_tuner::accept_feedback(int n_accepted, int n_drafted, double step_tps) { int64_t t_start = ggml_time_us(); n_calls++; // per-step TPS as reward: captures draft cost, verification cost, and acceptance benefit double reward = step_tps; for (auto & coord : coords) { coord.update(reward); } if (cooldown > 0) { cooldown--; if (step_ema <= 0.0) { step_ema = step_tps; } else { step_ema = step_ema_alpha * step_tps + (1.0 - step_ema_alpha) * step_ema; } } else if (step_ema <= 0.0) { step_ema = step_tps; } else { if (step_tps < step_ema * (1.0 - step_drop_pct)) { n_low++; if (n_low >= reset_after) { reset_exploration(); t_tuner_us += (ggml_time_us() - t_start); return; } } else { n_low = 0; } step_ema = step_ema_alpha * step_tps + (1.0 - step_ema_alpha) * step_ema; } if (n_calls <= 5 || (n_calls % log_every == 0)) { std::ostringstream oss; oss << "Autotune call=" << n_calls << " n_drafted=" << n_drafted << " n_accepted=" << n_accepted << " step_tps=" << std::fixed << std::setprecision(1) << step_tps << " ema=" << std::fixed << std::setprecision(1) << step_ema; for (const auto & coord : coords) { bool is_int = (coord.name != "p_min"); oss << " " << coord.name << "="; if (is_int) oss << (int)coord.arms[coord.current_idx].value; else oss << std::fixed << std::setprecision(2) << coord.arms[coord.current_idx].value; oss << "→best="; if (is_int) oss << (int)coord.arms[coord.best_idx].value; else oss << std::fixed << std::setprecision(2) << coord.arms[coord.best_idx].value; oss << "(Q=" << std::fixed << std::setprecision(1) << coord.arms[coord.best_idx].Q << ",N=" << coord.arms[coord.best_idx].N << ")"; } LOG_DBG("%s\n", oss.str().c_str()); } t_tuner_us += (ggml_time_us() - t_start); } void spec_tuner::end_of_request(double slot_tps, int n_past, common_params_speculative & active_params) { int64_t t_start = ggml_time_us(); n_requests++; GGML_UNUSED(n_past); if (ema_tps <= 0.0) { ema_tps = slot_tps; } else { ema_tps = ema_alpha * slot_tps + (1.0 - ema_alpha) * ema_tps; } write_best(active_params); enforce_constraints(active_params); t_tuner_us += (ggml_time_us() - t_start); print_best(); } void spec_tuner::print_best() const { { std::ostringstream oss; oss << "Autotune req=" << n_requests << " calls=" << n_calls << " tps=" << std::fixed << std::setprecision(2) << ema_tps; if (n_resets > 0) oss << " resets=" << n_resets; if (n_low > 0) oss << " n_low=" << n_low; oss << " best:"; for (const auto & coord : coords) { bool is_int = (coord.name != "p_min"); oss << " " << coord.name << "="; if (is_int) oss << (int)coord.arms[coord.best_idx].value; else oss << std::fixed << std::setprecision(2) << coord.arms[coord.best_idx].value; oss << "(Q=" << std::fixed << std::setprecision(2) << coord.arms[coord.best_idx].Q << ",N=" << coord.arms[coord.best_idx].N << ")"; } if (!coords.empty()) { oss << " | n_max arms:"; for (const auto & arm : coords[0].arms) { oss << " " << (int)arm.value << "(Q=" << std::fixed << std::setprecision(2) << arm.Q << ",N=" << arm.N << ")"; } } oss << " tuner=" << std::fixed << std::setprecision(3) << t_tuner_us / 1000.0 << "ms"; LOG_DBG("%s\n", oss.str().c_str()); } { std::ostringstream oss; oss << "Autotune reuse: "; for (const auto & coord : coords) { bool is_int = (coord.name != "p_min"); if (coord.name == "n_max") oss << "--draft-max "; else if (coord.name == "p_min") oss << "--draft-p-min "; else if (coord.name == "n_min") oss << "--draft-min "; else if (coord.name == "ngram_size_n") oss << "--spec-ngram-size-n "; else if (coord.name == "ngram_size_m") oss << "--spec-ngram-size-m "; else if (coord.name == "ngram_min_hits") oss << "--spec-ngram-min-hits "; else if (coord.name == "suffix_min_match_len") oss << "--suffix-pattern-len "; else oss << "--" << coord.name << " "; if (is_int) oss << (int)coord.arms[coord.best_idx].value << " "; else oss << std::fixed << std::setprecision(2) << coord.arms[coord.best_idx].value << " "; } LOG_INF("%s\n", oss.str().c_str()); } }