Merge branch 'main' into fcp/string_ban

firecoperana, 2026-02-04 21:56:08 -06:00 (committed by GitHub)
40 changed files with 1283 additions and 1023 deletions

View File

@@ -1,6 +1,6 @@
-#define LLAMA_API_INTERNAL
-#include "grammar-parser.h"
+#include "llama-grammar.h"
#include "ggml.h"
#include "llama.h"
#include "unicode.h"
@@ -77,27 +77,30 @@ int main(int argc, char** argv) {
grammar_str = buffer.str();
}
// Parse the GBNF grammar
-auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
+llama_grammar_parser parser;
+auto parsed_grammar = parser.parse(grammar_str.c_str());
// will be empty (default) if there are parse errors
-if (parsed_grammar.rules.empty()) {
-fprintf(stdout, "%s: failed to parse grammar\n", __func__);
+if (!parser.parse(grammar_str.c_str()) || parser.rules.empty()) {
+fprintf(stderr, "%s: failed to parse grammar\n", __func__);
return 1;
}
// Ensure that there is a "root" node.
-if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
-fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__);
+if (parser.symbol_ids.find("root") == parser.symbol_ids.end()) {
+fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
return 1;
}
-std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+std::vector<const llama_grammar_element*> grammar_rules(parser.c_rules());
// Create the LLAMA grammar
-auto grammar = llama_grammar_init(
+auto grammar = llama_grammar_init_impl(
grammar_rules.data(),
-grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+grammar_rules.size(), parser.symbol_ids.at("root"));
if (grammar == nullptr) {
throw std::runtime_error("Failed to initialize llama_grammar");
}
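Taken together, the migrated validator reduces to the following flow (a sketch assembled from the new lines above, not a verbatim copy of the file):

    llama_grammar_parser parser;
    if (!parser.parse(grammar_str.c_str()) || parser.rules.empty()) {
        fprintf(stderr, "%s: failed to parse grammar\n", __func__);
        return 1;
    }
    if (parser.symbol_ids.find("root") == parser.symbol_ids.end()) {
        fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
        return 1;
    }
    std::vector<const llama_grammar_element*> grammar_rules(parser.c_rules());
    auto grammar = llama_grammar_init_impl(
        grammar_rules.data(),
        grammar_rules.size(), parser.symbol_ids.at("root"));
    if (grammar == nullptr) {
        throw std::runtime_error("Failed to initialize llama_grammar");
    }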

View File

@@ -2,7 +2,7 @@
#include "console.h"
#include "llama.h"
#include "grammar-parser.h"
#include "llama-grammar.h"
#include <cassert>
#include <cinttypes>

View File

@@ -112,6 +112,16 @@ static T json_value(const json& body, const std::string& key, const T& default_v
}
}
// Control vector container for dynamic management
struct control_vector_container {
std::string path;
float scale;
int32_t layer_start;
int32_t layer_end;
llama_control_vector_data data;
bool applied;
};
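// Example (hypothetical values): after a successful load on a 32-layer model,
// a container might hold { path = "cv/happy.gguf", scale = 0.8f,
// layer_start = 1, layer_end = 32, data = <per-layer n_embd floats>,
// applied = false } until apply_control_vectors_internal() runs.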
// thin wrapper around common_grammar_trigger with (de)serialization functions
struct server_grammar_trigger {
common_grammar_trigger value;

View File

@@ -2067,9 +2067,205 @@ void server_context::process_single_task(server_task&& task) {
result.data = json{ { "success", true } };
queue_results.send(result);
} break;
case SERVER_TASK_TYPE_LOAD_CONTROL_VECTOR:
{
// Load control vector from file
std::string path = task.data.at("path");
float scale = task.data.value("scale", 1.0f);
int32_t layer_start = task.data.value("layer_start", 1);
int32_t layer_end = task.data.value("layer_end", llama_n_layer(model));
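// defaults: unit scale over the full layer range [1, n_layer]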
// Check if already loaded
int cv_id = -1;
for (size_t i = 0; i < control_vectors.size(); i++) {
if (control_vectors[i].path == path) {
control_vectors[i].scale = scale;
control_vectors[i].layer_start = layer_start;
control_vectors[i].layer_end = layer_end;
cv_id = i;
break;
}
}
if (cv_id == -1) {
control_vector_container new_cv;
new_cv.path = path;
new_cv.scale = scale;
new_cv.layer_start = layer_start;
new_cv.layer_end = layer_end;
new_cv.applied = false;
// Load the control vector data
llama_control_vector_load_info load_info;
load_info.fname = path;
load_info.strength = 1.0f; // Don't pre-scale here, we'll scale when applying
std::vector<llama_control_vector_load_info> load_infos = { load_info };
new_cv.data = llama_control_vector_load(load_infos);
if (new_cv.data.n_embd == -1) {
server_task_result result;
result.id = task.id;
result.error = true;
result.data = json{{ "success", false }, { "error", "Failed to load control vector from " + path }};
queue_results.send(result);
break;
}
// Validate dimension to prevent heap corruption
if (new_cv.data.n_embd != llama_model_n_embd(model)) {
server_task_result result;
result.id = task.id;
result.error = true;
result.data = json{{ "success", false },
{ "error", "Vector dimension mismatch" }};
queue_results.send(result);
break;
}
control_vectors.push_back(new_cv);
cv_id = control_vectors.size() - 1;
}
// Auto-apply control vectors after loading
if (!apply_control_vectors_internal()) {
server_task_result result;
result.id = task.id;
result.error = true;
result.data = json{{ "success", false }, { "error", "Failed to apply control vectors" }};
queue_results.send(result);
break;
}
server_task_result result;
result.id = task.id;
result.error = false;
result.data = json{{ "success", true }, { "id", cv_id }};
queue_results.send(result);
} break;
case SERVER_TASK_TYPE_UNLOAD_CONTROL_VECTOR:
{
// Validate that "id" field exists and is a number
if (!task.data.contains("id") || task.data["id"].is_null() || !task.data["id"].is_number()) {
server_task_result result;
result.id = task.id;
result.error = true;
result.data = json{{ "success", false }, { "error", "Missing or invalid 'id' field" }};
queue_results.send(result);
break;
}
int id = task.data.at("id");
if (id < 0 || id >= (int)control_vectors.size()) {
server_task_result result;
result.id = task.id;
result.error = true;
result.data = json{{ "success", false }, { "error", "Invalid control vector ID" }};
queue_results.send(result);
break;
}
// Remove the control vector from the list
control_vectors.erase(control_vectors.begin() + id);
// Reapply remaining control vectors
if (!apply_control_vectors_internal()) {
server_task_result result;
result.id = task.id;
result.error = true;
result.data = json{{ "success", false }, { "error", "Failed to apply control vectors" }};
queue_results.send(result);
break;
}
server_task_result result;
result.id = task.id;
result.error = false;
result.data = json{{ "success", true }};
queue_results.send(result);
} break;
case SERVER_TASK_TYPE_SET_CONTROL_VECTOR:
{
if (!apply_control_vectors_internal()) {
server_task_result result;
result.id = task.id;
result.error = true;
result.data = json{{ "success", false }, { "error", "Failed to apply control vectors" }};
queue_results.send(result);
break;
}
server_task_result result;
result.id = task.id;
result.error = false;
result.data = json{{ "success", true }};
queue_results.send(result);
} break;
}
}
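// Re-aggregate every control vector whose scale is non-zero into one combined
// buffer and apply it to the context; when nothing is active, the control
// vector state is cleared instead.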
bool server_context::apply_control_vectors_internal() {
llama_control_vector_data combined_cv = { -1, {} };
// Check if we have anything to apply
bool any_active = false;
for (const auto& cv : control_vectors) {
if (cv.scale != 0.0f) {
any_active = true;
break;
}
}
if (!any_active) {
// Clear control vectors if nothing is active
llama_control_vector_apply(ctx, nullptr, 0, 0, 0, 0);
return true;
}
// Aggregate control vectors with scaling
for (auto& cv : control_vectors) {
if (cv.scale == 0.0f) {
cv.applied = false;
continue;
}
if (combined_cv.n_embd == -1) {
combined_cv.n_embd = cv.data.n_embd;
combined_cv.data.resize(cv.data.data.size(), 0.0f);
}
for (size_t i = 0; i < cv.data.data.size(); i++) {
combined_cv.data[i] += cv.data.data[i] * cv.scale;
}
cv.applied = true;
}
// Apply combined control vector
if (combined_cv.n_embd != -1 && !combined_cv.data.empty()) {
int32_t min_layer_start = INT32_MAX;
int32_t max_layer_end = 0;
for (const auto& cv : control_vectors) {
if (cv.scale != 0.0f) {
min_layer_start = std::min(min_layer_start, cv.layer_start);
max_layer_end = std::max(max_layer_end, cv.layer_end);
}
}
int err = llama_control_vector_apply(ctx,
combined_cv.data.data(),
combined_cv.data.size(),
combined_cv.n_embd,
min_layer_start,
max_layer_end);
return (err == 0);
}
return true;
}
void server_context::on_finish_multitask(const server_task_multi& multitask) {
// all subtasks done == multitask is done
server_task_result result;
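For intuition, the aggregation above is just a weighted element-wise sum over the loaded vectors, applied across the union of their layer ranges. A self-contained sketch of that step (the container shape mirrors the diff; the numbers are made up):

    #include <cstdio>
    #include <vector>

    struct cv { float scale; std::vector<float> data; };

    int main() {
        // two hypothetical vectors over n_embd = 4; same loop as
        // "combined_cv.data[i] += cv.data.data[i] * cv.scale" above
        std::vector<cv> cvs = { { 0.5f, { 1, 0, 0, 0 } }, { -1.0f, { 0, 2, 0, 0 } } };
        std::vector<float> combined(4, 0.0f);
        for (const auto & v : cvs) {
            for (size_t i = 0; i < combined.size(); i++) {
                combined[i] += v.data[i] * v.scale;
            }
        }
        for (float x : combined) printf("%.2f ", x); // prints: 0.50 -2.00 0.00 0.00
        printf("\n");
        return 0;
    }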

View File

@@ -193,6 +193,7 @@ struct server_context {
llama_model* model = nullptr;
llama_context* ctx = nullptr;
std::vector<llama_lora_adapter_container> lora_adapters;
std::vector<control_vector_container> control_vectors;
gpt_params params_base;
@@ -332,4 +333,7 @@ struct server_context {
void buffer_and_check_string_ban(server_slot& slot, completion_token_output& result);
json model_meta() const;
// Re-aggregates all active vectors and updates the model state
bool apply_control_vectors_internal();
};

View File

@@ -31,6 +31,9 @@ enum server_task_type {
SERVER_TASK_TYPE_SLOT_RESTORE,
SERVER_TASK_TYPE_SLOT_ERASE,
SERVER_TASK_TYPE_SET_LORA,
SERVER_TASK_TYPE_LOAD_CONTROL_VECTOR,   // load a vector from disk (or re-scale one already loaded)
SERVER_TASK_TYPE_UNLOAD_CONTROL_VECTOR, // drop a vector by id and reapply the remainder
SERVER_TASK_TYPE_SET_CONTROL_VECTOR,    // reapply after scale/layer-range updates
};
enum oaicompat_type {

View File

@@ -1509,6 +1509,101 @@ int main(int argc, char ** argv) {
res.status = 200; // HTTP OK
};
// Control vector handlers
const auto handle_control_vectors_list = [&](const httplib::Request & req, httplib::Response & res) {
json result = json::array();
for (size_t i = 0; i < ctx_server.control_vectors.size(); ++i) {
auto & cv = ctx_server.control_vectors[i];
result.push_back({
{"id", i},
{"path", cv.path},
{"scale", cv.scale},
{"layer_start", cv.layer_start},
{"layer_end", cv.layer_end},
{"applied", cv.applied},
});
}
res.set_content(result.dump(), "application/json");
res.status = 200; // HTTP OK
};
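// A GET /control-vectors response therefore looks like (values hypothetical):
//   [{"id":0,"path":"cv/happy.gguf","scale":0.8,"layer_start":1,"layer_end":32,"applied":true}]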
const auto handle_control_vectors_load = [&](const httplib::Request & req, httplib::Response & res) {
const json body = json::parse(req.body);
server_task task;
task.type = SERVER_TASK_TYPE_LOAD_CONTROL_VECTOR;
task.data = body;
const int id_task = ctx_server.queue_tasks.post(std::move(task));
ctx_server.queue_results.add_waiting_task_id(id_task);
server_task_result result = ctx_server.queue_results.recv(id_task);
ctx_server.queue_results.remove_waiting_task_id(id_task);
res.set_content(result.data.dump(), "application/json");
res.status = result.error ? 400 : 200;
};
const auto handle_control_vectors_unload = [&](const httplib::Request & req, httplib::Response & res) {
const json body = json::parse(req.body);
server_task task;
task.type = SERVER_TASK_TYPE_UNLOAD_CONTROL_VECTOR;
task.data = body;
const int id_task = ctx_server.queue_tasks.post(std::move(task));
ctx_server.queue_results.add_waiting_task_id(id_task);
server_task_result result = ctx_server.queue_results.recv(id_task);
ctx_server.queue_results.remove_waiting_task_id(id_task);
res.set_content(result.data.dump(), "application/json");
res.status = result.error ? 400 : 200;
};
const auto handle_control_vectors_apply = [&](const httplib::Request & req, httplib::Response & res) {
const std::vector<json> body = json::parse(req.body);
int max_idx = ctx_server.control_vectors.size();
// Update scales for existing control vectors
for (auto & cv : ctx_server.control_vectors) {
cv.scale = 0.0f; // Reset all scales first
}
// Set new scales
for (auto entry : body) {
int id = entry.at("id");
float scale = entry.at("scale");
if (0 <= id && id < max_idx) {
ctx_server.control_vectors[id].scale = scale;
// Optionally update layer range
if (entry.contains("layer_start")) {
ctx_server.control_vectors[id].layer_start = entry.at("layer_start");
}
if (entry.contains("layer_end")) {
ctx_server.control_vectors[id].layer_end = entry.at("layer_end");
}
} else {
res.set_content(json{{ "success", false }, { "error", "Invalid control vector id" }}.dump(), "application/json");
res.status = 400;
return;
}
}
server_task task;
task.type = SERVER_TASK_TYPE_SET_CONTROL_VECTOR;
const int id_task = ctx_server.queue_tasks.post(std::move(task));
ctx_server.queue_results.add_waiting_task_id(id_task);
server_task_result result = ctx_server.queue_results.recv(id_task);
ctx_server.queue_results.remove_waiting_task_id(id_task);
res.set_content(result.data.dump(), "application/json");
res.status = result.error ? 400 : 200;
};
const auto list_saved_prompts = [&ctx_server, &params](const httplib::Request& req, httplib::Response& res) {
json response = json::array();
@@ -1925,6 +2020,11 @@ int main(int argc, char ** argv) {
// LoRA adapters hotswap
svr->Get ("/lora-adapters", handle_lora_adapters_list);
svr->Post("/lora-adapters", handle_lora_adapters_apply);
// Control vectors
svr->Get ("/control-vectors", handle_control_vectors_list);
svr->Post("/control-vectors/load", handle_control_vectors_load);
svr->Post("/control-vectors/unload", handle_control_vectors_unload);
svr->Post("/control-vectors/apply", handle_control_vectors_apply);
// Save & load slots
svr->Get ("/slots", handle_slots);
svr->Get ("/slots/list", list_slot_prompts);

View File

@@ -434,7 +434,7 @@ int main(int argc, char ** argv) {
break;
}
-llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling);
+common_sampler_clone(ctx_sampling, drafts[0].ctx_sampling);
int n_seq_cur = 1;
int n_past_cur = n_past_dft;
@@ -503,7 +503,7 @@ int main(int argc, char ** argv) {
drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
-llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling);
+common_sampler_clone(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling);
sa.push_back(n_seq_cur);