Merge mainline llama.cpp (#3)

* Merging mainline - WIP
* Merging mainline - WIP

  AVX2 and CUDA appear to work. CUDA performance seems slightly (~1-2%) lower, as is so often the case with llama.cpp/ggml after some "improvements" have been made.
* Merging mainline - fix Metal
* Remove check

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-02-06 06:20:08 +00:00 · 2024-07-27 07:55:01 +02:00
parent 0684c3e9c7
commit 154e0d75fc
612 changed files with 50817 additions and 165936 deletions
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -29,7 +29,9 @@ static void print_usage_information(const char * argv0, FILE * stream) {
    fprintf(stream, "    -p PROMPT, --prompt PROMPT           read prompt from the argument.\n");
    fprintf(stream, "    --stdin                              read prompt from standard input.\n");
    fprintf(stream, "    --no-bos                             do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+    fprintf(stream, "    --no-parse-special                   do not parse control tokens.\n");
    fprintf(stream, "    --log-disable                        disable logs. Makes stderr quiet when loading the model.\n");
+    fprintf(stream, "    --show-count                         print the total number of tokens.\n");
 }

 static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@@ -194,7 +196,9 @@ int main(int raw_argc, char ** raw_argv) {
    // variables where to put any arguments we see.
    bool printing_ids = false;
    bool no_bos = false;
+    bool no_parse_special = false;
    bool disable_logging = false;
+    bool show_token_count = false;
    const char * model_path = NULL;
    const char * prompt_path = NULL;
    const char * prompt_arg = NULL;
@@ -227,6 +231,9 @@ int main(int raw_argc, char ** raw_argv) {
        else if (arg == "--no-bos") {
            no_bos = true;
        }
+        else if (arg == "--no-parse-special") {
+            no_parse_special = true;
+        }
        else if (arg == "-p" || arg == "--prompt") {
            if (prompt_set) {
                fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
@@ -249,6 +256,9 @@ int main(int raw_argc, char ** raw_argv) {
        else if (arg == "--log-disable") {
            disable_logging = true;
        }
+        else if (arg == "--show-count") {
+            show_token_count = true;
+        }
        else {
            fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
            return 1;
@@ -354,9 +364,10 @@ int main(int raw_argc, char ** raw_argv) {

    const bool model_wants_add_bos = llama_should_add_bos_token(model);
    const bool add_bos = model_wants_add_bos && !no_bos;
+    const bool parse_special = !no_parse_special;

    std::vector<llama_token> tokens;
-    tokens = ::llama_tokenize(model, prompt, add_bos, true);
+    tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);

    if (printing_ids) {
        printf("[");
@@ -384,6 +395,9 @@ int main(int raw_argc, char ** raw_argv) {
        printf("]\n");
    }

+    if (show_token_count) {
+        printf("Total number of tokens: %zu\n", tokens.size()); // size() yields std::size_t; %zu is its matching specifier (%ld mismatches where long is 32-bit)
+    }
    // silence valgrind
    llama_free(ctx);
    llama_free_model(model);