mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-05 19:40:19 +00:00
server: add dynamic control vector management endpoints (#1223)
This implements the ability to load, unload, and scale control vectors (representation engineering) mid-inference, following the existing task-queue pattern used by LoRA adapters.

New Endpoints:
- GET /control-vectors
- POST /control-vectors/load
- POST /control-vectors/unload
- POST /control-vectors/apply (handles scaling)

Technical Notes:
- Centralizes vector aggregation logic to share implementation between load, unload, and apply tasks.
- Vectors are applied globally to the model context.
- Enforces dimension validation on load to safely reject incompatible vectors.

Co-authored-by: Gapeleon <gapeleon@users.noreply.github.com>
This commit is contained in:
@@ -111,6 +111,16 @@ static T json_value(const json& body, const std::string& key, const T& default_v
|
||||
}
|
||||
}
|
||||
|
||||
// Control vector container for dynamic management
|
||||
struct control_vector_container {
|
||||
std::string path;
|
||||
float scale;
|
||||
int32_t layer_start;
|
||||
int32_t layer_end;
|
||||
llama_control_vector_data data;
|
||||
bool applied;
|
||||
};
|
||||
|
||||
// thin wrapper around common_grammar_trigger with (de)serialization functions
|
||||
struct server_grammar_trigger {
|
||||
common_grammar_trigger value;
|
||||
|
||||
@@ -1958,9 +1958,205 @@ void server_context::process_single_task(server_task&& task) {
|
||||
result.data = json{ { "success", true } };
|
||||
queue_results.send(result);
|
||||
} break;
|
||||
case SERVER_TASK_TYPE_LOAD_CONTROL_VECTOR:
|
||||
{
|
||||
// Load control vector from file
|
||||
std::string path = task.data.at("path");
|
||||
float scale = task.data.value("scale", 1.0f);
|
||||
int32_t layer_start = task.data.value("layer_start", 1);
|
||||
int32_t layer_end = task.data.value("layer_end", llama_n_layer(model));
|
||||
|
||||
// Check if already loaded
|
||||
int cv_id = -1;
|
||||
for (size_t i = 0; i < control_vectors.size(); i++) {
|
||||
if (control_vectors[i].path == path) {
|
||||
control_vectors[i].scale = scale;
|
||||
control_vectors[i].layer_start = layer_start;
|
||||
control_vectors[i].layer_end = layer_end;
|
||||
cv_id = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (cv_id == -1) {
|
||||
control_vector_container new_cv;
|
||||
new_cv.path = path;
|
||||
new_cv.scale = scale;
|
||||
new_cv.layer_start = layer_start;
|
||||
new_cv.layer_end = layer_end;
|
||||
new_cv.applied = false;
|
||||
|
||||
// Load the control vector data
|
||||
llama_control_vector_load_info load_info;
|
||||
load_info.fname = path;
|
||||
load_info.strength = 1.0f; // Don't pre-scale here, we'll scale when applying
|
||||
|
||||
std::vector<llama_control_vector_load_info> load_infos = { load_info };
|
||||
new_cv.data = llama_control_vector_load(load_infos);
|
||||
|
||||
if (new_cv.data.n_embd == -1) {
|
||||
server_task_result result;
|
||||
result.id = task.id;
|
||||
result.error = true;
|
||||
result.data = json{{ "success", false }, { "error", "Failed to load control vector from " + path }};
|
||||
queue_results.send(result);
|
||||
break;
|
||||
}
|
||||
|
||||
// Validate dimension to prevent heap corruption
|
||||
if (new_cv.data.n_embd != llama_model_n_embd(model)) {
|
||||
server_task_result result;
|
||||
result.id = task.id;
|
||||
result.error = true;
|
||||
result.data = json{{ "success", false },
|
||||
{ "error", "Vector dimension mismatch" }};
|
||||
queue_results.send(result);
|
||||
break;
|
||||
}
|
||||
|
||||
control_vectors.push_back(new_cv);
|
||||
|
||||
cv_id = control_vectors.size() - 1;
|
||||
}
|
||||
|
||||
// Auto-apply control vectors after loading
|
||||
if (!apply_control_vectors_internal()) {
|
||||
server_task_result result;
|
||||
result.id = task.id;
|
||||
result.error = true;
|
||||
result.data = json{{ "success", false }, { "error", "Failed to apply control vectors" }};
|
||||
queue_results.send(result);
|
||||
break;
|
||||
}
|
||||
|
||||
server_task_result result;
|
||||
result.id = task.id;
|
||||
result.error = false;
|
||||
result.data = json{{ "success", true }, { "id", cv_id }};
|
||||
queue_results.send(result);
|
||||
} break;
|
||||
case SERVER_TASK_TYPE_UNLOAD_CONTROL_VECTOR:
|
||||
{
|
||||
// Validate that "id" field exists and is a number
|
||||
if (!task.data.contains("id") || task.data["id"].is_null() || !task.data["id"].is_number()) {
|
||||
server_task_result result;
|
||||
result.id = task.id;
|
||||
result.error = true;
|
||||
result.data = json{{ "success", false }, { "error", "Missing or invalid 'id' field" }};
|
||||
queue_results.send(result);
|
||||
break;
|
||||
}
|
||||
|
||||
int id = task.data.at("id");
|
||||
|
||||
if (id < 0 || id >= (int)control_vectors.size()) {
|
||||
server_task_result result;
|
||||
result.id = task.id;
|
||||
result.error = true;
|
||||
result.data = json{{ "success", false }, { "error", "Invalid control vector ID" }};
|
||||
queue_results.send(result);
|
||||
break;
|
||||
}
|
||||
|
||||
// Remove the control vector from the list
|
||||
control_vectors.erase(control_vectors.begin() + id);
|
||||
|
||||
// Reapply remaining control vectors
|
||||
if (!apply_control_vectors_internal()) {
|
||||
server_task_result result;
|
||||
result.id = task.id;
|
||||
result.error = true;
|
||||
result.data = json{{ "success", false }, { "error", "Failed to apply control vectors" }};
|
||||
queue_results.send(result);
|
||||
break;
|
||||
}
|
||||
|
||||
server_task_result result;
|
||||
result.id = task.id;
|
||||
result.error = false;
|
||||
result.data = json{{ "success", true }};
|
||||
queue_results.send(result);
|
||||
} break;
|
||||
case SERVER_TASK_TYPE_SET_CONTROL_VECTOR:
|
||||
{
|
||||
if (!apply_control_vectors_internal()) {
|
||||
server_task_result result;
|
||||
result.id = task.id;
|
||||
result.error = true;
|
||||
result.data = json{{ "success", false }, { "error", "Failed to apply control vectors" }};
|
||||
queue_results.send(result);
|
||||
break;
|
||||
}
|
||||
|
||||
server_task_result result;
|
||||
result.id = task.id;
|
||||
result.error = false;
|
||||
result.data = json{{ "success", true }};
|
||||
queue_results.send(result);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
bool server_context::apply_control_vectors_internal() {
|
||||
llama_control_vector_data combined_cv = { -1, {} };
|
||||
|
||||
// Check if we have anything to apply
|
||||
bool any_active = false;
|
||||
for (const auto& cv : control_vectors) {
|
||||
if (cv.scale != 0.0f) {
|
||||
any_active = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!any_active) {
|
||||
// Clear control vectors if nothing is active
|
||||
llama_control_vector_apply(ctx, nullptr, 0, 0, 0, 0);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Aggregate control vectors with scaling
|
||||
for (auto& cv : control_vectors) {
|
||||
if (cv.scale == 0.0f) {
|
||||
cv.applied = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (combined_cv.n_embd == -1) {
|
||||
combined_cv.n_embd = cv.data.n_embd;
|
||||
combined_cv.data.resize(cv.data.data.size(), 0.0f);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < cv.data.data.size(); i++) {
|
||||
combined_cv.data[i] += cv.data.data[i] * cv.scale;
|
||||
}
|
||||
cv.applied = true;
|
||||
}
|
||||
|
||||
// Apply combined control vector
|
||||
if (combined_cv.n_embd != -1 && !combined_cv.data.empty()) {
|
||||
int32_t min_layer_start = INT32_MAX;
|
||||
int32_t max_layer_end = 0;
|
||||
|
||||
for (const auto& cv : control_vectors) {
|
||||
if (cv.scale != 0.0f) {
|
||||
min_layer_start = std::min(min_layer_start, cv.layer_start);
|
||||
max_layer_end = std::max(max_layer_end, cv.layer_end);
|
||||
}
|
||||
}
|
||||
|
||||
int err = llama_control_vector_apply(ctx,
|
||||
combined_cv.data.data(),
|
||||
combined_cv.data.size(),
|
||||
combined_cv.n_embd,
|
||||
min_layer_start,
|
||||
max_layer_end);
|
||||
return (err == 0);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void server_context::on_finish_multitask(const server_task_multi& multitask) {
|
||||
// all subtasks done == multitask is done
|
||||
server_task_result result;
|
||||
|
||||
@@ -183,6 +183,7 @@ struct server_context {
|
||||
llama_model* model = nullptr;
|
||||
llama_context* ctx = nullptr;
|
||||
std::vector<llama_lora_adapter_container> lora_adapters;
|
||||
std::vector<control_vector_container> control_vectors;
|
||||
|
||||
gpt_params params_base;
|
||||
|
||||
@@ -316,4 +317,7 @@ struct server_context {
|
||||
bool accept_special_token(const server_slot& slot, const llama_token token);
|
||||
|
||||
json model_meta() const;
|
||||
|
||||
// Re-aggregates all active vectors and updates the model state
|
||||
bool apply_control_vectors_internal();
|
||||
};
|
||||
|
||||
@@ -31,6 +31,9 @@ enum server_task_type {
|
||||
SERVER_TASK_TYPE_SLOT_RESTORE,
|
||||
SERVER_TASK_TYPE_SLOT_ERASE,
|
||||
SERVER_TASK_TYPE_SET_LORA,
|
||||
SERVER_TASK_TYPE_LOAD_CONTROL_VECTOR,
|
||||
SERVER_TASK_TYPE_UNLOAD_CONTROL_VECTOR,
|
||||
SERVER_TASK_TYPE_SET_CONTROL_VECTOR,
|
||||
};
|
||||
|
||||
enum oaicompat_type {
|
||||
|
||||
@@ -1509,6 +1509,101 @@ int main(int argc, char ** argv) {
|
||||
res.status = 200; // HTTP OK
|
||||
};
|
||||
|
||||
// Control vector handlers
|
||||
// GET /control-vectors
// Responds with a JSON array holding one object per loaded control vector:
// its index (reported as "id"), source path, scale, layer range and applied flag.
const auto handle_control_vectors_list = [&](const httplib::Request & req, httplib::Response & res) {
    json listing = json::array();

    size_t index = 0;
    for (const auto & loaded : ctx_server.control_vectors) {
        json entry = json::object();
        entry["id"]          = index;
        entry["path"]        = loaded.path;
        entry["scale"]       = loaded.scale;
        entry["layer_start"] = loaded.layer_start;
        entry["layer_end"]   = loaded.layer_end;
        entry["applied"]     = loaded.applied;

        listing.push_back(std::move(entry));
        ++index;
    }

    res.set_content(listing.dump(), "application/json");
    res.status = 200; // HTTP OK
};
|
||||
|
||||
// POST /control-vectors/load
// Expects a JSON object with a string "path" and optional numeric "scale",
// "layer_start" and "layer_end". The request is validated here, so malformed
// input is answered with HTTP 400 and never reaches the task handler, where
// the unchecked at()/value() lookups would throw.
// The validated body is forwarded as a SERVER_TASK_TYPE_LOAD_CONTROL_VECTOR
// task and the task's result is returned to the client (400 on error, else 200).
const auto handle_control_vectors_load = [&](const httplib::Request & req, httplib::Response & res) {
    const auto reject = [&res](const std::string & message) {
        res.set_content(json{{ "success", false }, { "error", message }}.dump(), "application/json");
        res.status = 400;
    };

    // Non-throwing parse: malformed JSON yields a discarded value, which is not an object.
    const json body = json::parse(req.body, nullptr, false);
    if (!body.is_object()) {
        reject("Request body must be a JSON object");
        return;
    }

    if (!body.contains("path") || !body.at("path").is_string()) {
        reject("Missing or invalid 'path' field");
        return;
    }

    for (const char * key : { "scale", "layer_start", "layer_end" }) {
        if (body.contains(key) && !body.at(key).is_number()) {
            reject(std::string("Invalid '") + key + "' field");
            return;
        }
    }

    server_task task;
    task.type = SERVER_TASK_TYPE_LOAD_CONTROL_VECTOR;
    task.data = body;

    const int id_task = ctx_server.queue_tasks.post(std::move(task));
    ctx_server.queue_results.add_waiting_task_id(id_task);

    server_task_result result = ctx_server.queue_results.recv(id_task);
    ctx_server.queue_results.remove_waiting_task_id(id_task);

    res.set_content(result.data.dump(), "application/json");
    res.status = result.error ? 400 : 200;
};
|
||||
|
||||
// POST /control-vectors/unload
// Parses the request body as JSON, forwards it unchanged as a
// SERVER_TASK_TYPE_UNLOAD_CONTROL_VECTOR task, waits for the task's result and
// relays it to the client (HTTP 400 when the task reported an error, else 200).
const auto handle_control_vectors_unload = [&](const httplib::Request & req, httplib::Response & res) {
    const json payload = json::parse(req.body);

    server_task unload_task;
    unload_task.type = SERVER_TASK_TYPE_UNLOAD_CONTROL_VECTOR;
    unload_task.data = payload;

    // Queue the task and block until its result arrives.
    const int task_id = ctx_server.queue_tasks.post(std::move(unload_task));
    ctx_server.queue_results.add_waiting_task_id(task_id);
    server_task_result outcome = ctx_server.queue_results.recv(task_id);
    ctx_server.queue_results.remove_waiting_task_id(task_id);

    res.set_content(outcome.data.dump(), "application/json");
    if (outcome.error) {
        res.status = 400;
    } else {
        res.status = 200;
    }
};
|
||||
|
||||
// POST /control-vectors/apply
// Expects a JSON array of objects { "id", "scale", ["layer_start"], ["layer_end"] }.
// Vectors not mentioned in the array get scale 0 (inactive).
//
// The whole request is validated before any state is touched: a request that
// contains an invalid entry is rejected with HTTP 400 and leaves the current
// scales and layer ranges exactly as they were, instead of zeroing every scale
// and then bailing out without re-applying.
//
// After the update a SERVER_TASK_TYPE_SET_CONTROL_VECTOR task is posted and its
// result is relayed to the client (400 on error, else 200).
const auto handle_control_vectors_apply = [&](const httplib::Request & req, httplib::Response & res) {
    const auto reject = [&res](const std::string & message) {
        res.set_content(json{{ "success", false }, { "error", message }}.dump(), "application/json");
        res.status = 400;
    };

    // Non-throwing parse: malformed JSON yields a discarded value, which is not an array.
    const json body = json::parse(req.body, nullptr, false);
    if (!body.is_array()) {
        reject("Request body must be a JSON array");
        return;
    }

    const size_t n_loaded = ctx_server.control_vectors.size();

    // Pass 1: validate every entry without modifying anything.
    for (const auto & entry : body) {
        if (!entry.is_object() || !entry.contains("id") || !entry.at("id").is_number_integer()) {
            reject("Invalid control vector id");
            return;
        }
        const int64_t id = entry.at("id").get<int64_t>();
        if (id < 0 || static_cast<uint64_t>(id) >= n_loaded) {
            reject("Invalid control vector id");
            return;
        }
        if (!entry.contains("scale") || !entry.at("scale").is_number()) {
            reject("Missing or invalid 'scale' field");
            return;
        }
        for (const char * key : { "layer_start", "layer_end" }) {
            if (entry.contains(key) && !entry.at(key).is_number()) {
                reject(std::string("Invalid '") + key + "' field");
                return;
            }
        }
    }

    // Pass 2: commit. Reset all scales first, then set the requested ones.
    // NOTE(review): this mutates ctx_server.control_vectors from the HTTP handler
    // before the task runs - confirm it cannot race with the task-processing thread.
    for (auto & cv : ctx_server.control_vectors) {
        cv.scale = 0.0f;
    }

    for (const auto & entry : body) {
        auto & cv = ctx_server.control_vectors[entry.at("id").get<size_t>()];
        cv.scale = entry.at("scale").get<float>();

        // Optionally update layer range
        if (entry.contains("layer_start")) {
            cv.layer_start = entry.at("layer_start").get<int32_t>();
        }
        if (entry.contains("layer_end")) {
            cv.layer_end = entry.at("layer_end").get<int32_t>();
        }
    }

    server_task task;
    task.type = SERVER_TASK_TYPE_SET_CONTROL_VECTOR;

    const int id_task = ctx_server.queue_tasks.post(std::move(task));
    ctx_server.queue_results.add_waiting_task_id(id_task);

    server_task_result result = ctx_server.queue_results.recv(id_task);
    ctx_server.queue_results.remove_waiting_task_id(id_task);

    res.set_content(result.data.dump(), "application/json");
    res.status = result.error ? 400 : 200;
};
|
||||
|
||||
const auto list_saved_prompts = [&ctx_server, ¶ms](const httplib::Request& req, httplib::Response& res) {
|
||||
json response = json::array();
|
||||
|
||||
@@ -1925,6 +2020,11 @@ int main(int argc, char ** argv) {
|
||||
// LoRA adapters hotswap
|
||||
svr->Get ("/lora-adapters", handle_lora_adapters_list);
|
||||
svr->Post("/lora-adapters", handle_lora_adapters_apply);
|
||||
// Control vectors
|
||||
svr->Get ("/control-vectors", handle_control_vectors_list);
|
||||
svr->Post("/control-vectors/load", handle_control_vectors_load);
|
||||
svr->Post("/control-vectors/unload", handle_control_vectors_unload);
|
||||
svr->Post("/control-vectors/apply", handle_control_vectors_apply);
|
||||
// Save & load slots
|
||||
svr->Get ("/slots", handle_slots);
|
||||
svr->Get ("/slots/list", list_slot_prompts);
|
||||
|
||||
Reference in New Issue
Block a user