Files
ik_llama.cpp/common/spec-tuner.cpp
Samuel Oliveira Alves 260622faf6 Self-decoding: Adds support for suffix decoding (#1646)
* speculative: implement suffix-tree decoder

* speculative: add support to cache and tuner
2026-04-18 16:10:10 +02:00

376 lines
14 KiB
C++

#include "spec-tuner.h"
#include "ggml.h"
#include "log.h"
#include <algorithm>
#include <iomanip>
#include <random>
int spec_tuner_coord::find_nearest_arm(float value) const {
int idx = 0;
float best_dist = 1e30f;
for (int i = 0; i < (int)arms.size(); i++) {
float dist = std::fabs(arms[i].value - value);
if (dist < best_dist) {
best_dist = dist;
idx = i;
}
}
return idx;
}
int spec_tuner_coord::select_epsilon_greedy(double epsilon) const {
static thread_local std::mt19937 rng(std::random_device{}());
std::uniform_real_distribution<double> coin(0.0, 1.0);
if (coin(rng) < epsilon) {
std::uniform_int_distribution<int> dist(0, (int)arms.size() - 1);
return dist(rng);
}
return best_idx;
}
void spec_tuner_coord::update(double reward) {
auto & arm = arms[current_idx];
arm.N += 1;
arm.Q += (reward - arm.Q) / arm.N;
double best_Q = -1e30;
for (int i = 0; i < (int)arms.size(); i++) {
if (arms[i].N > 0 && arms[i].Q > best_Q) {
best_Q = arms[i].Q;
best_idx = i;
}
}
}
void spec_tuner_coord::reset_scores() {
for (auto & arm : arms) {
arm.Q = 0.0;
arm.N = 0;
}
current_idx = user_idx;
best_idx = user_idx;
}
void spec_tuner_coord::build_grid_float(float lo, float hi, int n_points, float user_value) {
arms.clear();
for (int i = 0; i < n_points; i++) {
float v = lo + (hi - lo) * i / std::max(1, n_points - 1);
arms.push_back({v, 0.0, 0});
}
bool found = false;
for (auto & a : arms) {
if (std::fabs(a.value - user_value) < 1e-6f) { found = true; break; }
}
if (!found) {
arms.push_back({user_value, 0.0, 0});
std::sort(arms.begin(), arms.end(), [](const spec_tuner_arm & a, const spec_tuner_arm & b) {
return a.value < b.value;
});
}
}
void spec_tuner_coord::build_grid_int(int lo, int hi, int step, int user_value) {
arms.clear();
for (int v = lo; v <= hi; v += step) {
arms.push_back({(float)v, 0.0, 0});
}
if (arms.empty() || (int)arms.back().value != hi) {
arms.push_back({(float)hi, 0.0, 0});
}
bool found = false;
for (auto & a : arms) {
if ((int)a.value == user_value) { found = true; break; }
}
if (!found && user_value >= lo && user_value <= hi) {
arms.push_back({(float)user_value, 0.0, 0});
std::sort(arms.begin(), arms.end(), [](const spec_tuner_arm & a, const spec_tuner_arm & b) {
return a.value < b.value;
});
}
}
void spec_tuner::reset_exploration() {
n_resets++;
LOG_DBG("Autotune task change detected (n_low=%d) — resetting MAB (reset #%d)\n", n_low, n_resets);
for (auto & coord : coords) {
coord.reset_scores();
}
n_low = 0;
cooldown = cooldown_max;
step_ema = 0.0;
n_calls = 0;
}
void spec_tuner::write_best(common_params_speculative & params) const {
for (const auto & coord : coords) {
float val = coord.arms[coord.best_idx].value;
if (coord.name == "n_max") params.n_max = (int32_t)val;
else if (coord.name == "p_min") params.p_min = val;
else if (coord.name == "n_min") params.n_min = (int32_t)val;
else if (coord.name == "ngram_size_n") params.ngram_size_n = (uint16_t)val;
else if (coord.name == "ngram_size_m") params.ngram_size_m = (uint16_t)val;
else if (coord.name == "ngram_min_hits") params.ngram_min_hits = (uint16_t)val;
else if (coord.name == "suffix_min_match_len") params.suffix_min_match_len = (int32_t)val;
}
}
void spec_tuner::init(common_speculative_type type, const common_params_speculative & user_params) {
enabled = true;
spec_type = type;
coords.clear();
n_calls = 0;
n_requests = 0;
ema_tps = 0.0;
step_ema = 0.0;
n_low = 0;
cooldown = 0;
n_resets = 0;
t_tuner_us = 0;
last_n_drafted = 0;
// all types get n_max
// For simplicity we will create a fixed grid of possible values
{
spec_tuner_coord coord;
coord.name = "n_max";
int hi = std::max(16, (int)user_params.n_max);
coord.build_grid_int(1, hi, 1, user_params.n_max);
coords.push_back(std::move(coord));
}
if (type == COMMON_SPECULATIVE_TYPE_DRAFT) {
{
spec_tuner_coord coord;
coord.name = "p_min";
coord.build_grid_float(0.0f, 0.95f, 11, user_params.p_min);
coords.push_back(std::move(coord));
}
{
spec_tuner_coord coord;
coord.name = "n_min";
coord.build_grid_int(0, 6, 1, user_params.n_min);
coords.push_back(std::move(coord));
}
}
if (type == COMMON_SPECULATIVE_TYPE_SUFFIX) {
{
spec_tuner_coord coord;
coord.name = "p_min";
coord.build_grid_float(0.0f, 0.95f, 11, user_params.p_min);
coords.push_back(std::move(coord));
}
{
spec_tuner_coord coord;
coord.name = "suffix_min_match_len";
coord.build_grid_int(1, 12, 1, user_params.suffix_min_match_len);
coords.push_back(std::move(coord));
}
}
// Ngram can change only n_max/n_min per call
if (type == COMMON_SPECULATIVE_TYPE_NGRAM_MOD) {
{
spec_tuner_coord coord;
coord.name = "n_min";
int hi = std::max(0, std::min(4, (int)user_params.n_max - 1));
coord.build_grid_int(0, hi, 1, user_params.n_min);
coords.push_back(std::move(coord));
}
}
for (auto & coord : coords) {
float user_val = 0.0f;
if (coord.name == "n_max") user_val = (float)user_params.n_max;
else if (coord.name == "p_min") user_val = user_params.p_min;
else if (coord.name == "n_min") user_val = (float)user_params.n_min;
else if (coord.name == "ngram_size_n") user_val = (float)user_params.ngram_size_n;
else if (coord.name == "ngram_size_m") user_val = (float)user_params.ngram_size_m;
else if (coord.name == "ngram_min_hits") user_val = (float)user_params.ngram_min_hits;
else if (coord.name == "suffix_min_match_len") user_val = (float)user_params.suffix_min_match_len;
coord.user_idx = coord.find_nearest_arm(user_val);
coord.best_idx = 0;
coord.current_idx = 0;
}
LOG_DBG("Autotune ε-greedy (ε=%.2f) per-draft-call, reward=per-step TPS\n", epsilon);
for (const auto & coord : coords) {
std::ostringstream oss;
oss << " " << coord.name << ": [";
for (size_t i = 0; i < coord.arms.size(); i++) {
if (i > 0) oss << ", ";
oss << coord.arms[i].value;
}
oss << "] (user=" << coord.arms[coord.user_idx].value << ")";
LOG_DBG("%s\n", oss.str().c_str());
}
}
void spec_tuner::propose(common_params_speculative & params) {
int64_t t_start = ggml_time_us();
// always select fresh arm for every draft call
for (auto & coord : coords) {
coord.current_idx = coord.select_epsilon_greedy(epsilon);
float val = coord.arms[coord.current_idx].value;
if (coord.name == "n_max") params.n_max = (int32_t)val;
else if (coord.name == "p_min") params.p_min = val;
else if (coord.name == "n_min") params.n_min = (int32_t)val;
else if (coord.name == "ngram_size_n") params.ngram_size_n = (uint16_t)val;
else if (coord.name == "ngram_size_m") params.ngram_size_m = (uint16_t)val;
else if (coord.name == "ngram_min_hits") params.ngram_min_hits = (uint16_t)val;
else if (coord.name == "suffix_min_match_len") params.suffix_min_match_len = (int32_t)val;
}
enforce_constraints(params);
t_tuner_us += (ggml_time_us() - t_start);
}
void spec_tuner::enforce_constraints(common_params_speculative & params) {
if (params.n_min < 0) params.n_min = 0;
if (params.n_max < 1) params.n_max = 1;
if (params.n_min > params.n_max) params.n_min = params.n_max;
if (params.p_min < 0.0f) params.p_min = 0.0f;
if (params.p_min > 0.95f) params.p_min = 0.95f;
if (params.ngram_size_n < 1) params.ngram_size_n = 1;
if (params.ngram_size_m < 1) params.ngram_size_m = 1;
if (params.ngram_min_hits < 1) params.ngram_min_hits = 1;
}
void spec_tuner::accept_feedback(int n_accepted, int n_drafted, double step_tps) {
int64_t t_start = ggml_time_us();
n_calls++;
// per-step TPS as reward: captures draft cost, verification cost, and acceptance benefit
double reward = step_tps;
for (auto & coord : coords) {
coord.update(reward);
}
if (cooldown > 0) {
cooldown--;
if (step_ema <= 0.0) {
step_ema = step_tps;
} else {
step_ema = step_ema_alpha * step_tps + (1.0 - step_ema_alpha) * step_ema;
}
} else if (step_ema <= 0.0) {
step_ema = step_tps;
} else {
if (step_tps < step_ema * (1.0 - step_drop_pct)) {
n_low++;
if (n_low >= reset_after) {
reset_exploration();
t_tuner_us += (ggml_time_us() - t_start);
return;
}
} else {
n_low = 0;
}
step_ema = step_ema_alpha * step_tps + (1.0 - step_ema_alpha) * step_ema;
}
if (n_calls <= 5 || (n_calls % log_every == 0)) {
std::ostringstream oss;
oss << "Autotune call=" << n_calls
<< " n_drafted=" << n_drafted
<< " n_accepted=" << n_accepted
<< " step_tps=" << std::fixed << std::setprecision(1) << step_tps
<< " ema=" << std::fixed << std::setprecision(1) << step_ema;
for (const auto & coord : coords) {
bool is_int = (coord.name != "p_min");
oss << " " << coord.name << "=";
if (is_int) oss << (int)coord.arms[coord.current_idx].value;
else oss << std::fixed << std::setprecision(2) << coord.arms[coord.current_idx].value;
oss << "→best=";
if (is_int) oss << (int)coord.arms[coord.best_idx].value;
else oss << std::fixed << std::setprecision(2) << coord.arms[coord.best_idx].value;
oss << "(Q=" << std::fixed << std::setprecision(1) << coord.arms[coord.best_idx].Q
<< ",N=" << coord.arms[coord.best_idx].N << ")";
}
LOG_DBG("%s\n", oss.str().c_str());
}
t_tuner_us += (ggml_time_us() - t_start);
}
void spec_tuner::end_of_request(double slot_tps, int n_past, common_params_speculative & active_params) {
int64_t t_start = ggml_time_us();
n_requests++;
GGML_UNUSED(n_past);
if (ema_tps <= 0.0) {
ema_tps = slot_tps;
} else {
ema_tps = ema_alpha * slot_tps + (1.0 - ema_alpha) * ema_tps;
}
write_best(active_params);
enforce_constraints(active_params);
t_tuner_us += (ggml_time_us() - t_start);
print_best();
}
void spec_tuner::print_best() const {
{
std::ostringstream oss;
oss << "Autotune req=" << n_requests
<< " calls=" << n_calls
<< " tps=" << std::fixed << std::setprecision(2) << ema_tps;
if (n_resets > 0) oss << " resets=" << n_resets;
if (n_low > 0) oss << " n_low=" << n_low;
oss << " best:";
for (const auto & coord : coords) {
bool is_int = (coord.name != "p_min");
oss << " " << coord.name << "=";
if (is_int) oss << (int)coord.arms[coord.best_idx].value;
else oss << std::fixed << std::setprecision(2) << coord.arms[coord.best_idx].value;
oss << "(Q=" << std::fixed << std::setprecision(2) << coord.arms[coord.best_idx].Q
<< ",N=" << coord.arms[coord.best_idx].N << ")";
}
if (!coords.empty()) {
oss << " | n_max arms:";
for (const auto & arm : coords[0].arms) {
oss << " " << (int)arm.value << "(Q=" << std::fixed << std::setprecision(2) << arm.Q
<< ",N=" << arm.N << ")";
}
}
oss << " tuner=" << std::fixed << std::setprecision(3) << t_tuner_us / 1000.0 << "ms";
LOG_DBG("%s\n", oss.str().c_str());
}
{
std::ostringstream oss;
oss << "Autotune reuse: ";
for (const auto & coord : coords) {
bool is_int = (coord.name != "p_min");
if (coord.name == "n_max") oss << "--draft-max ";
else if (coord.name == "p_min") oss << "--draft-p-min ";
else if (coord.name == "n_min") oss << "--draft-min ";
else if (coord.name == "ngram_size_n") oss << "--spec-ngram-size-n ";
else if (coord.name == "ngram_size_m") oss << "--spec-ngram-size-m ";
else if (coord.name == "ngram_min_hits") oss << "--spec-ngram-min-hits ";
else if (coord.name == "suffix_min_match_len") oss << "--suffix-pattern-len ";
else oss << "--" << coord.name << " ";
if (is_int) oss << (int)coord.arms[coord.best_idx].value << " ";
else oss << std::fixed << std::setprecision(2) << coord.arms[coord.best_idx].value << " ";
}
LOG_INF("%s\n", oss.str().c_str());
}
}