diff --git a/examples/server/server-common.h b/examples/server/server-common.h
index 52d1e5b3..1b4b2acc 100644
--- a/examples/server/server-common.h
+++ b/examples/server/server-common.h
@@ -111,6 +111,16 @@ static T json_value(const json& body, const std::string& key, const T& default_v
     }
 }
 
+// Control vector container for dynamic management
+struct control_vector_container {
+    std::string path;
+    float scale;
+    int32_t layer_start;
+    int32_t layer_end;
+    llama_control_vector_data data;
+    bool applied;
+};
+
 // thin wrapper around common_grammar_trigger with (de)serialization functions
 struct server_grammar_trigger {
     common_grammar_trigger value;
diff --git a/examples/server/server-context.cpp b/examples/server/server-context.cpp
index 3c4ff874..00c54d2b 100644
--- a/examples/server/server-context.cpp
+++ b/examples/server/server-context.cpp
@@ -1958,9 +1958,205 @@ void server_context::process_single_task(server_task&& task) {
                 result.data = json{ { "success", true } };
                 queue_results.send(result);
             } break;
+        case SERVER_TASK_TYPE_LOAD_CONTROL_VECTOR:
+            {
+                // Load control vector from file
+                std::string path = task.data.at("path");
+                float scale = task.data.value("scale", 1.0f);
+                int32_t layer_start = task.data.value("layer_start", 1);
+                int32_t layer_end = task.data.value("layer_end", llama_n_layer(model));
+
+                // Check if already loaded
+                int cv_id = -1;
+                for (size_t i = 0; i < control_vectors.size(); i++) {
+                    if (control_vectors[i].path == path) {
+                        control_vectors[i].scale = scale;
+                        control_vectors[i].layer_start = layer_start;
+                        control_vectors[i].layer_end = layer_end;
+                        cv_id = i;
+                        break;
+                    }
+                }
+
+                if (cv_id == -1) {
+                    control_vector_container new_cv;
+                    new_cv.path = path;
+                    new_cv.scale = scale;
+                    new_cv.layer_start = layer_start;
+                    new_cv.layer_end = layer_end;
+                    new_cv.applied = false;
+
+                    // Load the control vector data
+                    llama_control_vector_load_info load_info;
+                    load_info.fname = path;
+                    load_info.strength = 1.0f; // Don't pre-scale here, we'll scale when applying
+
+                    std::vector<llama_control_vector_load_info> load_infos = { load_info };
+                    new_cv.data = llama_control_vector_load(load_infos);
+
+                    if (new_cv.data.n_embd == -1) {
+                        server_task_result result;
+                        result.id = task.id;
+                        result.error = true;
+                        result.data = json{{ "success", false }, { "error", "Failed to load control vector from " + path }};
+                        queue_results.send(result);
+                        break;
+                    }
+
+                    // Validate dimension to prevent heap corruption
+                    if (new_cv.data.n_embd != llama_model_n_embd(model)) {
+                        server_task_result result;
+                        result.id = task.id;
+                        result.error = true;
+                        result.data = json{{ "success", false },
+                            { "error", "Vector dimension mismatch" }};
+                        queue_results.send(result);
+                        break;
+                    }
+
+                    control_vectors.push_back(new_cv);
+
+                    cv_id = control_vectors.size() - 1;
+                }
+
+                // Auto-apply control vectors after loading
+                if (!apply_control_vectors_internal()) {
+                    server_task_result result;
+                    result.id = task.id;
+                    result.error = true;
+                    result.data = json{{ "success", false }, { "error", "Failed to apply control vectors" }};
+                    queue_results.send(result);
+                    break;
+                }
+
+                server_task_result result;
+                result.id = task.id;
+                result.error = false;
+                result.data = json{{ "success", true }, { "id", cv_id }};
+                queue_results.send(result);
+            } break;
+        case SERVER_TASK_TYPE_UNLOAD_CONTROL_VECTOR:
+            {
+                // Validate that "id" field exists and is a number
+                if (!task.data.contains("id") || task.data["id"].is_null() || !task.data["id"].is_number()) {
+                    server_task_result result;
+                    result.id = task.id;
+                    result.error = true;
+                    result.data = json{{ "success", false }, { "error", "Missing or invalid 'id' field" }};
"success", false }, { "error", "Missing or invalid 'id' field" }}; + queue_results.send(result); + break; + } + + int id = task.data.at("id"); + + if (id < 0 || id >= (int)control_vectors.size()) { + server_task_result result; + result.id = task.id; + result.error = true; + result.data = json{{ "success", false }, { "error", "Invalid control vector ID" }}; + queue_results.send(result); + break; + } + + // Remove the control vector from the list + control_vectors.erase(control_vectors.begin() + id); + + // Reapply remaining control vectors + if (!apply_control_vectors_internal()) { + server_task_result result; + result.id = task.id; + result.error = true; + result.data = json{{ "success", false }, { "error", "Failed to apply control vectors" }}; + queue_results.send(result); + break; + } + + server_task_result result; + result.id = task.id; + result.error = false; + result.data = json{{ "success", true }}; + queue_results.send(result); + } break; + case SERVER_TASK_TYPE_SET_CONTROL_VECTOR: + { + if (!apply_control_vectors_internal()) { + server_task_result result; + result.id = task.id; + result.error = true; + result.data = json{{ "success", false }, { "error", "Failed to apply control vectors" }}; + queue_results.send(result); + break; + } + + server_task_result result; + result.id = task.id; + result.error = false; + result.data = json{{ "success", true }}; + queue_results.send(result); + } break; } } +bool server_context::apply_control_vectors_internal() { + llama_control_vector_data combined_cv = { -1, {} }; + + // Check if we have anything to apply + bool any_active = false; + for (const auto& cv : control_vectors) { + if (cv.scale != 0.0f) { + any_active = true; + break; + } + } + + if (!any_active) { + // Clear control vectors if nothing is active + llama_control_vector_apply(ctx, nullptr, 0, 0, 0, 0); + return true; + } + + // Aggregate control vectors with scaling + for (auto& cv : control_vectors) { + if (cv.scale == 0.0f) { + cv.applied = false; + continue; + } + + if (combined_cv.n_embd == -1) { + combined_cv.n_embd = cv.data.n_embd; + combined_cv.data.resize(cv.data.data.size(), 0.0f); + } + + for (size_t i = 0; i < cv.data.data.size(); i++) { + combined_cv.data[i] += cv.data.data[i] * cv.scale; + } + cv.applied = true; + } + + // Apply combined control vector + if (combined_cv.n_embd != -1 && !combined_cv.data.empty()) { + int32_t min_layer_start = INT32_MAX; + int32_t max_layer_end = 0; + + for (const auto& cv : control_vectors) { + if (cv.scale != 0.0f) { + min_layer_start = std::min(min_layer_start, cv.layer_start); + max_layer_end = std::max(max_layer_end, cv.layer_end); + } + } + + int err = llama_control_vector_apply(ctx, + combined_cv.data.data(), + combined_cv.data.size(), + combined_cv.n_embd, + min_layer_start, + max_layer_end); + return (err == 0); + } + + return true; +} + void server_context::on_finish_multitask(const server_task_multi& multitask) { // all subtasks done == multitask is done server_task_result result; diff --git a/examples/server/server-context.h b/examples/server/server-context.h index 34493565..4e52999a 100644 --- a/examples/server/server-context.h +++ b/examples/server/server-context.h @@ -183,6 +183,7 @@ struct server_context { llama_model* model = nullptr; llama_context* ctx = nullptr; std::vector lora_adapters; + std::vector control_vectors; gpt_params params_base; @@ -316,4 +317,7 @@ struct server_context { bool accept_special_token(const server_slot& slot, const llama_token token); json model_meta() const; + + // Re-aggregates all active 
+    bool apply_control_vectors_internal();
 };
diff --git a/examples/server/server-task.h b/examples/server/server-task.h
index 942097d3..1f4736f9 100644
--- a/examples/server/server-task.h
+++ b/examples/server/server-task.h
@@ -31,6 +31,9 @@ enum server_task_type {
     SERVER_TASK_TYPE_SLOT_RESTORE,
     SERVER_TASK_TYPE_SLOT_ERASE,
     SERVER_TASK_TYPE_SET_LORA,
+    SERVER_TASK_TYPE_LOAD_CONTROL_VECTOR,
+    SERVER_TASK_TYPE_UNLOAD_CONTROL_VECTOR,
+    SERVER_TASK_TYPE_SET_CONTROL_VECTOR,
 };
 
 enum oaicompat_type {
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index fa792f9e..ee8edd7b 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1509,6 +1509,101 @@ int main(int argc, char ** argv) {
         res.status = 200; // HTTP OK
     };
 
+    // Control vector handlers
+    const auto handle_control_vectors_list = [&](const httplib::Request & req, httplib::Response & res) {
+        json result = json::array();
+        for (size_t i = 0; i < ctx_server.control_vectors.size(); ++i) {
+            auto & cv = ctx_server.control_vectors[i];
+            result.push_back({
+                {"id", i},
+                {"path", cv.path},
+                {"scale", cv.scale},
+                {"layer_start", cv.layer_start},
+                {"layer_end", cv.layer_end},
+                {"applied", cv.applied},
+            });
+        }
+        res.set_content(result.dump(), "application/json");
+        res.status = 200; // HTTP OK
+    };
+
+    const auto handle_control_vectors_load = [&](const httplib::Request & req, httplib::Response & res) {
+        const json body = json::parse(req.body);
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_LOAD_CONTROL_VECTOR;
+        task.data = body;
+
+        const int id_task = ctx_server.queue_tasks.post(std::move(task));
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        res.set_content(result.data.dump(), "application/json");
+        res.status = result.error ? 400 : 200;
+    };
+
+    const auto handle_control_vectors_unload = [&](const httplib::Request & req, httplib::Response & res) {
+        const json body = json::parse(req.body);
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_UNLOAD_CONTROL_VECTOR;
+        task.data = body;
+
+        const int id_task = ctx_server.queue_tasks.post(std::move(task));
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        res.set_content(result.data.dump(), "application/json");
+        res.status = result.error ? 400 : 200;
+    };
+
+    const auto handle_control_vectors_apply = [&](const httplib::Request & req, httplib::Response & res) {
+        const std::vector<json> body = json::parse(req.body);
+        int max_idx = ctx_server.control_vectors.size();
+
+        // Update scales for existing control vectors
+        for (auto & cv : ctx_server.control_vectors) {
+            cv.scale = 0.0f; // Reset all scales first
+        }
+
+        // Set new scales
+        for (auto entry : body) {
+            int id = entry.at("id");
+            float scale = entry.at("scale");
+            if (0 <= id && id < max_idx) {
+                ctx_server.control_vectors[id].scale = scale;
+
+                // Optionally update layer range
+                if (entry.contains("layer_start")) {
+                    ctx_server.control_vectors[id].layer_start = entry.at("layer_start");
+                }
+                if (entry.contains("layer_end")) {
+                    ctx_server.control_vectors[id].layer_end = entry.at("layer_end");
+                }
+            } else {
+                res.set_content(json{{ "success", false }, { "error", "Invalid control vector id" }}.dump(), "application/json");
+                res.status = 400;
+                return;
+            }
+        }
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SET_CONTROL_VECTOR;
+
+        const int id_task = ctx_server.queue_tasks.post(std::move(task));
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        res.set_content(result.data.dump(), "application/json");
+        res.status = result.error ? 400 : 200;
+    };
+
     const auto list_saved_prompts = [&ctx_server, &params](const httplib::Request& req, httplib::Response& res) {
         json response = json::array();
 
@@ -1925,6 +2020,11 @@ int main(int argc, char ** argv) {
     // LoRA adapters hotswap
     svr->Get ("/lora-adapters", handle_lora_adapters_list);
     svr->Post("/lora-adapters", handle_lora_adapters_apply);
+    // Control vectors
+    svr->Get ("/control-vectors", handle_control_vectors_list);
+    svr->Post("/control-vectors/load", handle_control_vectors_load);
+    svr->Post("/control-vectors/unload", handle_control_vectors_unload);
+    svr->Post("/control-vectors/apply", handle_control_vectors_apply);
     // Save & load slots
     svr->Get ("/slots", handle_slots);
     svr->Get ("/slots/list", list_slot_prompts);
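
Usage sketch (not part of the patch): the Python snippet below exercises the new endpoints, assuming a server listening on http://localhost:8080, the requests package, and a control-vector GGUF at a hypothetical path ./vectors/happy.gguf. Field names and defaults follow the handlers above.

# Usage sketch for the /control-vectors endpoints added above.
# Assumptions (not from the patch): server at http://localhost:8080,
# a control vector file at ./vectors/happy.gguf, and the requests package.
import requests

BASE = "http://localhost:8080"

# Load a control vector; the task handler defaults scale to 1.0 and the
# layer range to [1, n_layer] when those fields are omitted.
r = requests.post(f"{BASE}/control-vectors/load",
                  json={"path": "./vectors/happy.gguf", "scale": 0.8})
cv_id = r.json()["id"]

# List the loaded vectors with their current scale, layer range and applied flag.
print(requests.get(f"{BASE}/control-vectors").json())

# Re-scale via /apply: the handler resets every scale to 0 first, so any vector
# not listed in the request body is effectively disabled.
requests.post(f"{BASE}/control-vectors/apply",
              json=[{"id": cv_id, "scale": -0.5, "layer_start": 5, "layer_end": 20}])

# Unload by id; the remaining vectors are re-aggregated and re-applied.
requests.post(f"{BASE}/control-vectors/unload", json={"id": cv_id})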