diff --git a/common/common.cpp b/common/common.cpp
index 0b890b59..2e7d6312 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -3589,11 +3589,11 @@ struct llama_model * llama_load_model_from_hf(
 // Batch utils
 //
 
-void llama_batch_clear(struct llama_batch & batch) {
+void common_batch_clear(struct llama_batch & batch) {
     batch.n_tokens = 0;
 }
 
-void llama_batch_add(
+void common_batch_add(
         struct llama_batch & batch,
         llama_token id,
         llama_pos pos,
@@ -3620,10 +3620,10 @@ std::vector<llama_token> llama_tokenize(
     const std::string & text,
     bool add_special,
     bool parse_special) {
-    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+    return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
 }
 
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
     const struct llama_model * model,
     const std::string & text,
     bool add_special,
@@ -3665,7 +3665,7 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
     const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
@@ -3697,7 +3697,7 @@ std::string llama_token_to_piece(const struct llama_model* model, llama_token to
     return piece;
 }
 
-std::string llama_detokenize(const llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_token_to_piece(const llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
     int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
diff --git a/common/common.h b/common/common.h
index 569e189c..b67d62d1 100644
--- a/common/common.h
+++ b/common/common.h
@@ -513,9 +513,9 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector llama_tokenize(
     bool add_special,
     bool parse_special = false);
 
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
     const struct llama_model * model,
     const std::string & text,
     bool add_special,
@@ -548,7 +548,7 @@ std::vector<llama_token> llama_tokenize(
 
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
-std::string llama_token_to_piece(
+std::string common_token_to_piece(
     const struct llama_context * ctx,
     llama_token token,
     bool special = true);
@@ -561,7 +561,7 @@ std::string llama_token_to_piece(
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
-std::string llama_detokenize(
+std::string common_token_to_piece(
     const llama_context * ctx,
     const std::vector<llama_token> & tokens,
     bool special = true);
diff --git a/common/log.h b/common/log.h
index d699ba3b..551c3e8b 100644
--- a/common/log.h
+++ b/common/log.h
@@ -756,7 +756,7 @@ inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
             first = false;
         }
 
-        auto detokenized = llama_token_to_piece(ctx, token);
+        auto detokenized = common_token_to_piece(ctx, token);
 
         detokenized.erase(
             std::remove_if(
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 8aa2222f..ef5cb43a 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -6,7 +6,7 @@
 #include 
 using json = nlohmann::ordered_json;
-struct llama_sampling_context * llama_sampling_init(const struct llama_vocab* vocab, const struct llama_sampling_params & params) {
+struct llama_sampling_context * common_sampler_init(const struct llama_vocab* vocab, const struct llama_sampling_params & params) {
     struct llama_sampling_context * result = new llama_sampling_context();
 
     result->params = params;
@@ -129,7 +129,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_vocab* vo
     return result;
 }
 
-void llama_sampling_free(struct llama_sampling_context * ctx) {
+void common_sampler_free(struct llama_sampling_context * ctx) {
     if (ctx->grammar != NULL) {
         llama_grammar_free(ctx->grammar);
     }
@@ -138,7 +138,7 @@ void llama_sampling_free(struct llama_sampling_context * ctx) {
     delete ctx;
 }
 
-void llama_sampling_reset(const struct llama_vocab* vocab, llama_sampling_context * ctx) {
+void common_sampler_reset(const struct llama_vocab* vocab, llama_sampling_context * ctx) {
     if (ctx->grammar != NULL) {
         llama_grammar_free(ctx->grammar);
@@ -239,7 +239,7 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama
     std::string result;
 
     for (int i = size - n; i < size; i++) {
-        result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
+        result += common_token_to_piece(ctx_main, ctx_sampling->prev[i]);
     }
 
     return result;
@@ -495,11 +495,11 @@ static llama_token llama_sampling_sample_impl(
             // for (int i = 0; i < n_top; i++) {
             //     const llama_token id = cur_p.data[i].id;
            //     (void)id; // To avoid a warning that id is unused when logging is disabled.
-            //     LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
+            //     LOG(" - %5d: '%12s' (%.3f)\n", id, common_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
             // }
             //}
 
-            //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
+            //LOG("sampled token: %5d: '%s'\n", id, common_token_to_piece(ctx_main, id).c_str());
         }
     }
@@ -519,7 +519,7 @@ static llama_token llama_sampling_sample_impl(
         // If the token is not valid according to the grammar, perform resampling
         if (!is_valid) {
-            LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
+            LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, common_token_to_piece(ctx_main, id).c_str());
 
             // Restore logits from the copy
             std::copy(original_logits.begin(), original_logits.end(), logits);
@@ -611,7 +611,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
     return cur_p;
 }
 
-llama_token llama_sampling_sample(
+llama_token common_sampler_sample(
         struct llama_sampling_context * ctx_sampling,
         struct llama_context * ctx_main,
         struct llama_context * ctx_cfg,
@@ -630,7 +630,7 @@ llama_token_data_array llama_sampling_prepare(
     return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
 }
 
-void llama_sampling_accept(
+void common_sampler_accept(
         struct llama_sampling_context * ctx_sampling,
         struct llama_context * ctx_main,
         llama_token id,
@@ -649,7 +649,7 @@ void llama_sampling_accept(
     }
 }
 
-llama_token_data_array * llama_sampling_get_candidates(struct llama_sampling_context * ctx_sampling) {
+llama_token_data_array * common_sampler_get_candidates(struct llama_sampling_context * ctx_sampling) {
     return &ctx_sampling->cur_p;
 }
 
@@ -659,10 +659,10 @@ std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_samplin
         idxs[i] = i;
     }
 
-    return llama_sampling_sample_and_accept_n(gsmpl, ctx, idxs, draft);
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
 }
 
-std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const std::vector<llama_token> & draft) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const std::vector<llama_token> & draft) {
     GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
 
     std::vector<llama_token> result;
@@ -670,9 +670,9 @@ std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_samplin
     size_t i = 0;
     for (; i < draft.size(); i++) {
-        const llama_token id = llama_sampling_sample(gsmpl, ctx, nullptr, idxs[i]);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, nullptr, idxs[i]);
 
-        llama_sampling_accept(gsmpl, ctx, id, true);
+        common_sampler_accept(gsmpl, ctx, id, true);
 
         result.push_back(id);
@@ -682,9 +682,9 @@ std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_samplin
     }
 
     if (i == draft.size()) {
-        const llama_token id = llama_sampling_sample(gsmpl, ctx, nullptr, idxs[i]);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, nullptr, idxs[i]);
 
-        llama_sampling_accept(gsmpl, ctx, id, true);
+        common_sampler_accept(gsmpl, ctx, id, true);
 
         result.push_back(id);
     }
diff --git a/common/sampling.h b/common/sampling.h
index 718dae34..a5420fa7 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -134,14 +134,14 @@ struct llama_sampling_context {
 
 // Create a new sampling context instance.
-struct llama_sampling_context * llama_sampling_init(const struct llama_vocab* vocab, const struct llama_sampling_params & params);
+struct llama_sampling_context * common_sampler_init(const struct llama_vocab* vocab, const struct llama_sampling_params & params);
 
-void llama_sampling_free(struct llama_sampling_context * ctx);
+void common_sampler_free(struct llama_sampling_context * ctx);
 
 // Reset the sampler context
 // - clear prev tokens
 // - reset grammar
-void llama_sampling_reset(const struct llama_vocab* vocab, llama_sampling_context * ctx);
+void common_sampler_reset(const struct llama_vocab* vocab, llama_sampling_context * ctx);
 
 // Set the sampler seed
 void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
@@ -169,7 +169,7 @@ std::vector llama_sampling_types_from_chars(const std::strin
 // this is a common sampling function used across the examples for convenience
 // it can serve as a starting point for implementing your own sampling function
 // Note: When using multiple sequences, it is the caller's responsibility to call
-// llama_sampling_reset when a sequence ends
+// common_sampler_reset when a sequence ends
 //
 // required:
 //  - ctx_main: context to use for sampling
@@ -183,7 +183,7 @@ std::vector llama_sampling_types_from_chars(const std::strin
 //  - token: sampled token
 //  - candidates: vector of candidate tokens
 //
-llama_token llama_sampling_sample(
+llama_token common_sampler_sample(
         struct llama_sampling_context * ctx_sampling,
         struct llama_context * ctx_main,
         struct llama_context * ctx_cfg,
@@ -198,7 +198,7 @@ llama_token_data_array llama_sampling_prepare(
     bool apply_grammar = true,
     std::vector<float> * original_logits = nullptr);
 
-void llama_sampling_accept(
+void common_sampler_accept(
         struct llama_sampling_context * ctx_sampling,
         struct llama_context * ctx_main,
         llama_token id,
@@ -206,11 +206,11 @@ void llama_sampling_accept(
 // returns at least 1 token, up to draft.size()
 
 // access the internal list of current candidate tokens
-llama_token_data_array * llama_sampling_get_candidates(struct llama_sampling_context * ctx_sampling); +llama_token_data_array * common_sampler_get_candidates(struct llama_sampling_context * ctx_sampling); std::vector llama_sampling_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector & draft); -std::vector llama_sampling_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector & idxs, const std::vector & draft); +std::vector common_sampler_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector & idxs, const std::vector & draft); llama_grammar* llama_sampler_init_llg(const llama_vocab* vocab, const char* grammar_kind, const char* grammar_data); diff --git a/common/speculative.cpp b/common/speculative.cpp index 326a2df3..ff0e167f 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -59,7 +59,7 @@ struct llama_speculative * llama_speculative_init( llama_sampler_type::TOP_K, }; const auto *model_dft = llama_get_model(ctx_dft); - result->smpl = llama_sampling_init(llama_get_model_vocab(model_dft), params); + result->smpl = common_sampler_init(llama_get_model_vocab(model_dft), params); } #endif @@ -74,7 +74,7 @@ void llama_speculative_free(struct llama_speculative * spec) { return; } - llama_sampling_free(spec->smpl); + common_sampler_free(spec->smpl); llama_batch_free(spec->batch); @@ -133,8 +133,8 @@ bool llama_speculative_are_compatible( if (std::strcmp(token_text_tgt, token_text_dft) != 0) { LLAMA_LOG_INFO("%s: draft model vocab must match target model to use speculation but ", __func__); LLAMA_LOG_INFO("token %d content differs - target '%s', draft '%s'\n", i, - llama_token_to_piece(ctx_tgt, i).c_str(), - llama_token_to_piece(ctx_dft, i).c_str()); + common_token_to_piece(ctx_tgt, i).c_str(), + common_token_to_piece(ctx_dft, i).c_str()); return false; } } @@ -201,14 +201,14 @@ std::vector llama_speculative_gen_draft( std::vector prompt_tgt_draft_model; if (!spec->vocab_dft_compatible) { std::string text; - text = llama_detokenize(ctx_tgt, prompt_tgt_main_model, true); + text = common_token_to_piece(ctx_tgt, prompt_tgt_main_model, true); text = replace_to_dft(spec, text); LLAMA_LOG_DEBUG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str()); prompt_tgt_draft_model = llama_tokenize(ctx_dft, text, false, true); // convert id_last to draft vocab std::vector id_last_vec(1, id_last); - text = llama_detokenize(ctx_tgt, id_last_vec); + text = common_token_to_piece(ctx_tgt, id_last_vec); LLAMA_LOG_DEBUG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str()); id_last = llama_tokenize(ctx_dft, text, false, true)[0]; } @@ -272,11 +272,11 @@ std::vector llama_speculative_gen_draft( } // prepare a batch to evaluate any new tokens in the prompt - llama_batch_clear(batch); + common_batch_clear(batch); for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) { //LLAMA_LOG_INFO("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]); - llama_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false); + common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false); prompt_dft.push_back(prompt_tgt[i]); } @@ -292,8 +292,8 @@ std::vector llama_speculative_gen_draft( // LLAMA_LOG_INFO("%s: n_past = %d\n", __func__, n_past); - llama_batch_clear(batch); - llama_batch_add (batch, id_last, n_past, { 0 }, true); + common_batch_clear(batch); + 
common_batch_add (batch, id_last, n_past, { 0 }, true); prompt_dft.push_back(id_last); @@ -301,25 +301,25 @@ std::vector llama_speculative_gen_draft( llama_decode(ctx_dft, batch); - llama_sampling_reset(llama_get_vocab(ctx_dft), smpl); + common_sampler_reset(llama_get_vocab(ctx_dft), smpl); // sample n_draft tokens from the draft model for (int i = 0; i < params.n_draft; ++i) { - llama_batch_clear(batch); + common_batch_clear(batch); - llama_sampling_sample(smpl, ctx_dft, nullptr, 0); + common_sampler_sample(smpl, ctx_dft, nullptr, 0); - const auto * cur_p = llama_sampling_get_candidates(smpl); + const auto * cur_p = common_sampler_get_candidates(smpl); // for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) { // LLAMA_LOG_INFO(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n", - // k, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); + // k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); // } // add drafted token for each sequence const llama_token id = cur_p->data[0].id; - llama_sampling_accept(smpl, ctx_dft, id, true); + common_sampler_accept(smpl, ctx_dft, id, true); result.push_back(id); @@ -332,7 +332,7 @@ std::vector llama_speculative_gen_draft( break; } - llama_batch_add(batch, id, n_past + i + 1, { 0 }, true); + common_batch_add(batch, id, n_past + i + 1, { 0 }, true); // evaluate the drafted tokens on the draft model llama_decode(ctx_dft, batch); @@ -341,7 +341,7 @@ std::vector llama_speculative_gen_draft( } if (!spec->vocab_dft_compatible) { - std::string detokenized = llama_detokenize(ctx_dft, result, true); + std::string detokenized = common_token_to_piece(ctx_dft, result, true); detokenized = replace_to_tgt(spec, detokenized); LLAMA_LOG_DEBUG("draft->main detokenized string: '%s'\n", detokenized.c_str()); result = llama_tokenize(ctx_tgt, detokenized, false, true); diff --git a/common/train.cpp b/common/train.cpp index fef1e57c..07c77203 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -955,7 +955,7 @@ size_t tokenize_file( } if (sample_size > 0) { - // llama_tokenize expects zero terminated string, + // common_tokenize expects zero terminated string, // copy sample into buffer and zero terminate it. buf_sample.resize(sample_size); memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size); diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 55f825fe..b33845bf 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -113,7 +113,7 @@ int main(int argc, char ** argv) { // warm up { for (int i = 0; i < 16; ++i) { - llama_batch_add(batch, 0, i, { 0 }, false); + common_batch_add(batch, 0, i, { 0 }, false); } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { @@ -144,11 +144,11 @@ int main(int argc, char ** argv) { continue; } - llama_batch_clear(batch); + common_batch_clear(batch); for (int i = 0; i < pp; ++i) { for (int j = 0; j < (is_pp_shared ? 
1 : pl); ++j) { - llama_batch_add(batch, 0, i, { j }, false); + common_batch_add(batch, 0, i, { j }, false); } } batch.logits[batch.n_tokens - 1] = true; @@ -173,10 +173,10 @@ int main(int argc, char ** argv) { const auto t_tg_start = ggml_time_us(); for (int i = 0; i < tg; ++i) { - llama_batch_clear(batch); + common_batch_clear(batch); for (int j = 0; j < pl; ++j) { - llama_batch_add(batch, 0, pp + i, { j }, true); + common_batch_add(batch, 0, pp + i, { j }, true); } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 53fbfb0a..d7b57c2c 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -52,7 +52,7 @@ int main(int argc, char ** argv) { // tokenize the prompt std::vector tokens_list; - tokens_list = ::llama_tokenize(model, params.prompt, true); + tokens_list = ::common_tokenize(model, params.prompt, true); const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel; @@ -86,7 +86,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n"); for (auto id : tokens_list) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + fprintf(stderr, "%s", common_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -102,7 +102,7 @@ int main(int argc, char ** argv) { // evaluate the initial prompt for (size_t i = 0; i < tokens_list.size(); ++i) { - llama_batch_add(batch, tokens_list[i], i, seq_ids, false); + common_batch_add(batch, tokens_list[i], i, seq_ids, false); } GGML_ASSERT(batch.n_tokens == (int) tokens_list.size()); @@ -117,8 +117,8 @@ int main(int argc, char ** argv) { decoder_start_token_id = llama_token_bos(model); } - llama_batch_clear(batch); - llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false); + common_batch_clear(batch); + common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false); } // llama_decode will output logits only for the last token of the prompt @@ -155,7 +155,7 @@ int main(int argc, char ** argv) { while (n_cur <= n_predict) { // prepare the next batch - llama_batch_clear(batch); + common_batch_clear(batch); // sample the next token for each parallel sequence / stream for (int32_t i = 0; i < n_parallel; ++i) { @@ -201,16 +201,16 @@ int main(int argc, char ** argv) { // if there is only one stream, we print immediately to stdout if (n_parallel == 1) { - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + LOG_TEE("%s", common_token_to_piece(ctx, new_token_id).c_str()); fflush(stdout); } - streams[i] += llama_token_to_piece(ctx, new_token_id); + streams[i] += common_token_to_piece(ctx, new_token_id); i_batch[i] = batch.n_tokens; // push this new token for next evaluation - llama_batch_add(batch, new_token_id, n_cur, { i }, true); + common_batch_add(batch, new_token_id, n_cur, { i }, true); n_decode += 1; } diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 2f75df5d..19553809 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -29,7 +29,7 @@ template static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { std::string ret; for (; begin != end; ++begin) { - ret += llama_token_to_piece(ctx, *begin); + ret += common_token_to_piece(ctx, *begin); } return ret; diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 5e3e8bb2..1a97d234 100644 --- a/examples/embedding/embedding.cpp +++ 
b/examples/embedding/embedding.cpp @@ -26,7 +26,7 @@ static std::vector split_lines(const std::string & s, const std::st static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { size_t n_tokens = tokens.size(); for (size_t i = 0; i < n_tokens; i++) { - llama_batch_add(batch, tokens[i], i, { seq_id }, true); + common_batch_add(batch, tokens[i], i, { seq_id }, true); } } @@ -166,7 +166,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); for (int j = 0; j < (int) inputs[i].size(); j++) { - fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], common_token_to_piece(ctx, inputs[i][j]).c_str()); } fprintf(stderr, "\n\n"); } @@ -206,7 +206,7 @@ int main(int argc, char ** argv) { batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s; s = 0; - llama_batch_clear(batch); + common_batch_clear(batch); } // add to batch diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 9c2cfc6e..f51792de 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -14,11 +14,11 @@ static std::vector> encode(llama_context * ctx, const std::ve llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); for (uint64_t i = 0; i < sentences.size(); i++) { - llama_batch_clear(batch); + common_batch_clear(batch); const std::string input_string = instruction + sentences[i]; - std::vector inputs = llama_tokenize(mdl, input_string, true, false); + std::vector inputs = common_tokenize(mdl, input_string, true, false); const int32_t n_toks = inputs.size(); @@ -27,7 +27,7 @@ static std::vector> encode(llama_context * ctx, const std::ve // inputs.push_back(llama_token_eos(mdl)); // we want to ignore instruction tokens for mean pooling - const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size(); + const int32_t n_inst = common_tokenize(mdl, instruction, true, false).size(); #ifdef GRIT_DEBUG // debug tokens - should be matching as referenced in the GritLM sample @@ -39,7 +39,7 @@ static std::vector> encode(llama_context * ctx, const std::ve // add input to batch (this increments n_tokens) for (int32_t j = 0; j < n_toks; j++) { - llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst); + common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst); } // clear previous kv_cache values (irrelevant for embeddings) @@ -104,14 +104,14 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); - std::vector inputs = llama_tokenize(mdl, prompt, false, true); + std::vector inputs = common_tokenize(mdl, prompt, false, true); int32_t i_current_token = 0; while (true) { - llama_batch_clear(bat); + common_batch_clear(bat); auto n_inputs = (int32_t)inputs.size(); for (int32_t i = 0; i < n_inputs; i++) { - llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1); + common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1); } inputs.clear(); @@ -130,7 +130,7 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo break; } - std::string piece = llama_token_to_piece(ctx, token); + std::string piece = common_token_to_piece(ctx, token); if (stream) { std::printf("%s", piece.c_str()); 
std::fflush(stdout); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index e6d15f29..f98b2aba 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -264,13 +264,13 @@ int main(int argc, char ** argv) { LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str()); } if (params.n_keep > 0) { LOG_TEE("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_TEE("%s", common_token_to_piece(ctx, embd_inp[i]).c_str()); } LOG_TEE("'\n"); } @@ -349,7 +349,7 @@ int main(int argc, char ** argv) { std::vector embd; - struct llama_sampling_context * ctx_sampling = llama_sampling_init(llama_get_model_vocab(model), sparams); + struct llama_sampling_context * ctx_sampling = common_sampler_init(llama_get_model_vocab(model), sparams); while (n_remain != 0 || params.interactive) { // predict @@ -421,9 +421,9 @@ int main(int argc, char ** argv) { embd.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr); + const llama_token id = common_sampler_sample(ctx_sampling, ctx, nullptr); - llama_sampling_accept(ctx_sampling, ctx, id, true); + common_sampler_accept(ctx_sampling, ctx, id, true); LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); @@ -444,7 +444,7 @@ int main(int argc, char ** argv) { // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); + common_sampler_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -456,7 +456,7 @@ int main(int argc, char ** argv) { // display text if (input_echo) { for (auto id : embd) { - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = common_token_to_piece(ctx, id); printf("%s", token_str.c_str()); if (embd.size() > 1) { @@ -479,7 +479,7 @@ int main(int argc, char ** argv) { if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){ if (is_interacting && !params.interactive_first) { // print an eot token - printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); + printf("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str()); } fflush(stdout); printf("\n"); @@ -601,7 +601,7 @@ int main(int argc, char ** argv) { for (size_t i = original_size; i < embd_inp.size(); ++i) { const llama_token token = embd_inp[i]; output_tokens.push_back(token); - output_ss << llama_token_to_piece(ctx, token); + output_ss << common_token_to_piece(ctx, token); } n_remain -= line_inp.size(); @@ -615,7 +615,7 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - llama_sampling_reset(llama_get_model_vocab(model), ctx_sampling); + common_sampler_reset(llama_get_model_vocab(model), ctx_sampling); } is_interacting = false; } @@ -634,7 +634,7 @@ int main(int argc, char ** argv) { } } if (!params.interactive && n_remain <= 0) { - printf("%s", llama_token_to_piece(ctx, 
llama_token_eot(model)).c_str()); + printf("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str()); fflush(stdout); } @@ -644,7 +644,7 @@ int main(int argc, char ** argv) { llama_free(ctx); llama_free_model(model); - llama_sampling_free(ctx_sampling); + common_sampler_free(ctx_sampling); llama_backend_free(); #ifndef LOG_DISABLE_LOGS diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index b817be2d..12602257 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -82,7 +82,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n\n"); for (auto id : inp) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + fprintf(stderr, "%s", common_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -118,7 +118,7 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1); // target model sampling context - struct llama_sampling_context * ctx_sampling = llama_sampling_init(llama_get_model_vocab(model), params.sparams); + struct llama_sampling_context * ctx_sampling = common_sampler_init(llama_get_model_vocab(model), params.sparams); // verification n-grams std::vector ngrams_cur(G); @@ -159,12 +159,12 @@ int main(int argc, char ** argv) { // sample first token { - id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0); + id = common_sampler_sample(ctx_sampling, ctx, NULL, 0); - llama_sampling_accept(ctx_sampling, ctx, id, true); + common_sampler_accept(ctx_sampling, ctx, id, true); { - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = common_token_to_piece(ctx, id); printf("%s", token_str.c_str()); fflush(stdout); @@ -204,10 +204,10 @@ int main(int argc, char ** argv) { // V V V V V V // id { - llama_batch_clear(batch); + common_batch_clear(batch); // current token - first token of the first level - llama_batch_add(batch, id, n_past, seq_id_all, true); + common_batch_add(batch, id, n_past, seq_id_all, true); // verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation { @@ -232,7 +232,7 @@ int main(int argc, char ** argv) { ngrams_cur[g].tokens [j + 1] = t; ngrams_cur[g].i_batch[j + 1] = batch.n_tokens; - llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true); + common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true); } } } @@ -244,13 +244,13 @@ int main(int argc, char ** argv) { seq_id_look[j] = i + j + 1; } - llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false); + common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false); } // fill the rest of the levels for (int j = 1; j < N - 1; j++) { for (int i = 0; i < W; i++) { - llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2); + common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2); } } } @@ -284,13 +284,13 @@ int main(int argc, char ** argv) { } // sample the next token - id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch); + id = common_sampler_sample(ctx_sampling, ctx, NULL, i_batch); - llama_sampling_accept(ctx_sampling, ctx, id, true); + common_sampler_accept(ctx_sampling, ctx, id, true); // print { - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = common_token_to_piece(ctx, id); if (v == 0) { printf("%s", token_str.c_str()); @@ -330,7 +330,7 @@ int main(int argc, char ** argv) { // print known n-grams starting with token id (debug) if (0 && v == 0) { if (ngrams_observed.cnt[id] > 0) { - 
printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str()); + printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], common_token_to_piece(ctx, id).c_str()); } for (int i = 0; i < ngrams_observed.cnt[id]; i++) { @@ -339,7 +339,7 @@ int main(int argc, char ** argv) { const int idx = id*(N - 1)*G + i*(N - 1); for (int j = 0; j < N - 1; j++) { - const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]); + const std::string token_str = common_token_to_piece(ctx, ngrams_observed.tokens[idx + j]); printf("%s", token_str.c_str()); } @@ -361,7 +361,7 @@ int main(int argc, char ** argv) { if (v == 0) { // sample from the last level for (int i = 0; i < W; i++) { - tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i); + tokens_j[N - 2][i] = common_sampler_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i); } } else { for (int i = 0; i < W; i++) { @@ -471,7 +471,7 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_kv_cache_view_free(&kvc_view); - llama_sampling_free(ctx_sampling); + common_sampler_free(ctx_sampling); llama_batch_free(batch); diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 1fff4f74..00bf077c 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -84,7 +84,7 @@ int main(int argc, char ** argv){ fprintf(stderr, "\n\n"); for (auto id : inp) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + fprintf(stderr, "%s", common_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -106,7 +106,7 @@ int main(int argc, char ** argv){ bool has_eos = false; - struct llama_sampling_context * ctx_sampling = llama_sampling_init(llama_get_model_vocab(model), params.sparams); + struct llama_sampling_context * ctx_sampling = common_sampler_init(llama_get_model_vocab(model), params.sparams); std::vector draft; @@ -130,11 +130,11 @@ int main(int argc, char ** argv){ int i_dft = 0; while (true) { // sample from the target model - llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft); + llama_token id = common_sampler_sample(ctx_sampling, ctx, NULL, i_dft); - llama_sampling_accept(ctx_sampling, ctx, id, true); + common_sampler_accept(ctx_sampling, ctx, id, true); - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = common_token_to_piece(ctx, id); if (!params.use_color) { printf("%s", token_str.c_str()); @@ -196,8 +196,8 @@ int main(int argc, char ** argv){ // clean the cache of draft tokens that weren't accepted llama_kv_cache_seq_rm(ctx, 0, n_past, -1); - llama_batch_clear(batch_tgt); - llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); + common_batch_clear(batch_tgt); + common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); // Draft already contains a single token sampled from the model: GGML_ASSERT(draft.size() == 1); @@ -207,7 +207,7 @@ int main(int argc, char ** argv){ llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); for (size_t i = 1; i < draft.size(); ++i) { - llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true); + common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true); } t_draft_us += ggml_time_us() - t_start_draft_us; @@ -243,7 +243,7 @@ int main(int argc, char ** argv){ LOG_TEE("\ntarget:\n"); llama_print_timings(ctx); - llama_sampling_free(ctx_sampling); + 
common_sampler_free(ctx_sampling); llama_batch_free(batch_tgt); llama_free(ctx); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index d8f295f2..6797eef5 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -402,7 +402,7 @@ int main(int argc, char ** argv) { LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str()); } if (ctx_guidance) { @@ -410,14 +410,14 @@ int main(int argc, char ** argv) { LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str()); LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); for (int i = 0; i < (int) guidance_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", guidance_inp[i], common_token_to_piece(ctx, guidance_inp[i]).c_str()); } } if (params.n_keep > add_bos) { LOG_TEE("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_TEE("%s", common_token_to_piece(ctx, embd_inp[i]).c_str()); } LOG_TEE("'\n"); } @@ -449,7 +449,7 @@ int main(int argc, char ** argv) { if (params.verbose_prompt) { auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); } } } @@ -464,7 +464,7 @@ int main(int argc, char ** argv) { if (params.verbose_prompt) { auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); } } } @@ -474,7 +474,7 @@ int main(int argc, char ** argv) { if (params.verbose_prompt) { auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); } } } @@ -549,7 +549,7 @@ int main(int argc, char ** argv) { antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true)); } - struct llama_sampling_context * ctx_sampling = llama_sampling_init(llama_get_model_vocab(model), sparams); + struct llama_sampling_context * ctx_sampling = common_sampler_init(llama_get_model_vocab(model), sparams); if (!ctx_sampling) { fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); exit(1); @@ -750,9 +750,9 @@ int main(int argc, char ** argv) { LOG("saved session to %s\n", path_session.c_str()); } - const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); + const llama_token id = common_sampler_sample(ctx_sampling, ctx, ctx_guidance); - llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true); + common_sampler_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true); LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); @@ -773,7 +773,7 @@ int main(int argc, char ** argv) { // push the 
prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false); + common_sampler_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -785,7 +785,7 @@ int main(int argc, char ** argv) { // display text if (input_echo && display) { for (auto id : embd) { - const std::string token_str = llama_token_to_piece(ctx, id, params.special); + const std::string token_str = common_token_to_piece(ctx, id, params.special); // Console/Stream Output fprintf(stdout, "%s", token_str.c_str()); @@ -877,7 +877,7 @@ int main(int argc, char ** argv) { // if current token is not EOG, we add it to current assistant message if (params.conversation && !waiting_for_first_input) { auto id = llama_sampling_last(ctx_sampling); - assistant_ss << llama_token_to_piece(ctx, id, false); + assistant_ss << common_token_to_piece(ctx, id, false); } if ((n_past > 0 || waiting_for_first_input) && is_interacting) { @@ -955,7 +955,7 @@ int main(int argc, char ** argv) { for (size_t i = original_size; i < embd_inp.size(); ++i) { const llama_token token = embd_inp[i]; output_tokens.push_back(token); - output_ss << llama_token_to_piece(ctx, token); + output_ss << common_token_to_piece(ctx, token); } // reset assistant message @@ -973,7 +973,7 @@ int main(int argc, char ** argv) { if (n_past > 0 || waiting_for_first_input) { if (is_interacting) { - llama_sampling_reset(llama_get_model_vocab(model), ctx_sampling); + common_sampler_reset(llama_get_model_vocab(model), ctx_sampling); } is_interacting = false; waiting_for_first_input = false; @@ -1006,7 +1006,7 @@ int main(int argc, char ** argv) { llama_free(ctx); llama_free_model(model); - llama_sampling_free(ctx_sampling); + common_sampler_free(ctx_sampling); llama_backend_free(); #ifndef LOG_DISABLE_LOGS diff --git a/examples/mtmd/mtmd-cli.cpp b/examples/mtmd/mtmd-cli.cpp index d499cbad..c7a830cd 100644 --- a/examples/mtmd/mtmd-cli.cpp +++ b/examples/mtmd/mtmd-cli.cpp @@ -72,30 +72,12 @@ using common_params = gpt_params; inline common_init_result common_init_from_params(gpt_params & params) { return llama_init_from_gpt_params(params); } -inline llama_sampling_context * common_sampler_init(const llama_model * model, const llama_sampling_params & sparams) { - return llama_sampling_init(llama_get_model_vocab(model), sparams); -} + inline std::vector common_tokenize(const llama_context * ctx, const std::string & text, bool add_special, bool parse_special = false) { return llama_tokenize(ctx, text, add_special, parse_special); } -inline void common_sampler_free(common_sampler * smpl) { - llama_sampling_free(smpl); -} -inline llama_token common_sampler_sample(common_sampler * gsmpl, llama_context * ctx, int idx, [[maybe_unused]] bool grammar_first = false) { - return llama_sampling_sample(gsmpl, ctx, nullptr, idx); -} -inline void common_sampler_accept(common_sampler * gsmpl, llama_context * ctx, llama_token token, bool accept_grammar) { - llama_sampling_accept(gsmpl, ctx, token, accept_grammar); -} -inline std::string common_token_to_piece(const llama_context * ctx, llama_token token, bool special = true) { - return llama_token_to_piece(ctx, token, special); -} -inline void common_batch_clear(llama_batch & batch) { - llama_batch_clear(batch); -} -inline void common_batch_add(llama_batch & batch, llama_token id, llama_pos pos, const std::vector & seq_ids, bool 
logits) { - llama_batch_add(batch, id, pos, seq_ids, logits); -} + + void common_init() { #ifdef NDEBUG const char * build_type = ""; @@ -143,8 +125,7 @@ struct mtmd_cli_context { mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) { model = llama_init.model; //.get(); lctx = llama_init.context; //.get(); - vocab = llama_model_get_vocab(model); - smpl = common_sampler_init(model, params.sparams); //sampling); + smpl = common_sampler_init(vocab, params.sparams); //sampling); n_threads = params.n_threads; batch = llama_batch_init(1, 0, 1); // batch for next token generation n_batch = params.n_batch; @@ -225,7 +206,7 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) { break; } - llama_token token_id = common_sampler_sample(ctx.smpl, ctx.lctx, -1); + llama_token token_id = common_sampler_sample(ctx.smpl, ctx.lctx, nullptr, -1); generated_tokens.push_back(token_id); common_sampler_accept(ctx.smpl, ctx.lctx, token_id, true); @@ -403,7 +384,7 @@ int main(int argc, char ** argv) { if (line == "/clear") { ctx.n_past = 0; llama_kv_cache_seq_rm(ctx.lctx, 0, 1, -1); - //llama_memory_seq_rm(llama_get_memory(ctx.lctx), 0, 1, -1); // keep BOS + //llama_kv_cache_seq_rm(llama_get_memory(ctx.lctx), 0, 1, -1); // keep BOS LOG_TEE("Chat history cleared\n\n"); continue; } diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 0d5e9f7e..b96b3081 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -51,7 +51,7 @@ static std::vector k_prompts = { struct client { ~client() { if (ctx_sampling) { - llama_sampling_free(ctx_sampling); + common_sampler_free(ctx_sampling); } } @@ -161,7 +161,7 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < clients.size(); ++i) { auto & client = clients[i]; client.id = i; - client.ctx_sampling = llama_sampling_init(llama_get_model_vocab(model), params.sparams); + client.ctx_sampling = common_sampler_init(llama_get_model_vocab(model), params.sparams); } std::vector tokens_system; @@ -190,7 +190,7 @@ int main(int argc, char ** argv) { LOG_TEE("%s: Evaluating the system prompt ...\n", __func__); for (int32_t i = 0; i < n_tokens_system; ++i) { - llama_batch_add(batch, tokens_system[i], i, { 0 }, false); + common_batch_add(batch, tokens_system[i], i, { 0 }, false); } if (llama_decode(ctx, batch) != 0) { @@ -214,7 +214,7 @@ int main(int argc, char ** argv) { llama_kv_cache_dump_view_seqs(kvc_view, 40); } - llama_batch_clear(batch); + common_batch_clear(batch); // decode any currently ongoing sequences for (auto & client : clients) { @@ -224,7 +224,7 @@ int main(int argc, char ** argv) { client.i_batch = batch.n_tokens; - llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true); + common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true); client.n_decoded += 1; } @@ -253,14 +253,14 @@ int main(int argc, char ** argv) { client.prompt = client.input + "\nAssistant:"; client.response = ""; - llama_sampling_reset(llama_get_model_vocab(model), client.ctx_sampling); + common_sampler_reset(llama_get_model_vocab(model), client.ctx_sampling); // do not prepend BOS because we have a system prompt! 
std::vector tokens_prompt; tokens_prompt = ::llama_tokenize(ctx, client.prompt, false); for (size_t i = 0; i < tokens_prompt.size(); ++i) { - llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false); + common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false); } // extract the logits only for the last token @@ -341,9 +341,9 @@ int main(int argc, char ** argv) { //printf("client %d, seq %d, token %d, pos %d, batch %d\n", // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch); - const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i); + const llama_token id = common_sampler_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i); - llama_sampling_accept(client.ctx_sampling, ctx, id, true); + common_sampler_accept(client.ctx_sampling, ctx, id, true); if (client.n_decoded == 1) { // start measuring generation time after the first token to make sure all concurrent clients @@ -351,7 +351,7 @@ int main(int argc, char ** argv) { client.t_start_gen = ggml_time_us(); } - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = common_token_to_piece(ctx, id); client.response += token_str; client.sampled = id; diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index d03215cd..c53c3e48 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -133,10 +133,10 @@ int main(int argc, char ** argv) { n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; } - llama_batch_clear(batch); + common_batch_clear(batch); for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) { - llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); + common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); } if (i + n_batch >= n_tokens_all) { @@ -167,10 +167,10 @@ int main(int argc, char ** argv) { n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; - llama_batch_clear(batch); + common_batch_clear(batch); for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) { - llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); + common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); } if (i + n_batch >= n_tokens_all) { @@ -239,16 +239,16 @@ int main(int argc, char ** argv) { break; } - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + LOG_TEE("%s", common_token_to_piece(ctx, new_token_id).c_str()); fflush(stdout); n_decode += 1; // prepare the next batch - llama_batch_clear(batch); + common_batch_clear(batch); // push this new token for next evaluation - llama_batch_add(batch, new_token_id, n_past++, { 0 }, true); + common_batch_add(batch, new_token_id, n_past++, { 0 }, true); } n_cur += 1; diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index fc0980d0..5c5d2397 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -869,7 +869,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { hs_cur.seq_tokens[2].size() - hs_cur.common_prefix + hs_cur.seq_tokens[3].size() - hs_cur.common_prefix; - //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, true).size()); + //GGML_ASSERT(hs_cur.common_prefix >= ::common_tokenize(ctx, hs_cur.context, true).size()); // Delete the selected random example from the prompt if (randomize_tasks) { @@ -906,7 +906,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { size_t i1 = i0; size_t i_logits 
= 0; // this tells us how many logits were needed before this point in the batch - llama_batch_clear(batch); + common_batch_clear(batch); // batch as much tasks as possible into the available context // each task has 4 unique sequence ids - one for each ending @@ -922,7 +922,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } for (size_t i = 0; i < hs_cur.common_prefix; ++i) { - llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false); + common_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false); } batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix n_logits += 1; @@ -932,7 +932,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { // TODO: don't evaluate the last token of each sequence for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) { const bool needs_logits = i < seq_tokens_size - 1; - llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits); + common_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits); n_logits += needs_logits; } } @@ -1191,7 +1191,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { size_t i1 = i0; size_t i_logits = 0; - llama_batch_clear(batch); + common_batch_clear(batch); while (n_cur + (int) data[i1].required_tokens <= n_ctx) { int n_logits = 0; @@ -1201,7 +1201,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { } for (size_t i = 0; i < data[i1].common_prefix; ++i) { - llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false); + common_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false); } batch.logits[batch.n_tokens - 1] = true; n_logits += 1; @@ -1209,7 +1209,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { for (int s = 0; s < 2; ++s) { // TODO: end before the last token, no need to predict past the end of the sequences for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) { - llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true); + common_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true); n_logits += 1; } } @@ -1547,7 +1547,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params size_t i1 = i0; size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch - llama_batch_clear(batch); + common_batch_clear(batch); // batch as much tasks as possible into the available context // each task has 4 unique sequence ids - one for each ending @@ -1569,8 +1569,8 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s; for (size_t i = 0; i < cur_task.common_prefix; ++i) { - //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false); - llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false); + //common_batch_clear(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false); + common_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false); } batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix n_logits += 1; @@ -1580,7 +1580,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params // TODO: don't evaluate the last token of each 
sequence for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) { const bool needs_logits = i < seq_tokens_size - 1; - llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits); + common_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits); n_logits += needs_logits; } } diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index e5f8eda5..eb0b1ecc 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -76,7 +76,7 @@ static std::vector chunk_file(const std::string & filename, int chunk_siz static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { size_t n_tokens = tokens.size(); for (size_t i = 0; i < n_tokens; i++) { - llama_batch_add(batch, tokens[i], i, { seq_id }, true); + common_batch_add(batch, tokens[i], i, { seq_id }, true); } } @@ -204,7 +204,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size()); for (int j = 0; j < (int) chunks[i].tokens.size(); j++) { - fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], common_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); } fprintf(stderr, "\n\n"); } @@ -232,7 +232,7 @@ int main(int argc, char ** argv) { if (batch.n_tokens + n_toks > n_batch) { float * out = emb + p * n_embd; batch_decode(ctx, batch, out, s, n_embd); - llama_batch_clear(batch); + common_batch_clear(batch); p += s; s = 0; } @@ -266,7 +266,7 @@ int main(int argc, char ** argv) { std::vector query_emb(n_embd, 0); batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd); - llama_batch_clear(query_batch); + common_batch_clear(query_batch); // compute cosine similarities { diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 3ea7c790..73537a5b 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -74,7 +74,7 @@ int main(int argc, char ** argv) { } llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx, &candidates_p); - auto next_token_str = llama_token_to_piece(ctx, next_token); + auto next_token_str = common_token_to_piece(ctx, next_token); printf("%s", next_token_str.c_str()); result0 += next_token_str; @@ -133,7 +133,7 @@ int main(int argc, char ** argv) { } llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx2, &candidates_p); - auto next_token_str = llama_token_to_piece(ctx2, next_token); + auto next_token_str = common_token_to_piece(ctx2, next_token); printf("%s", next_token_str.c_str()); result1 += next_token_str; @@ -224,7 +224,7 @@ int main(int argc, char ** argv) { } llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx3, &candidates_p); - auto next_token_str = llama_token_to_piece(ctx3, next_token); + auto next_token_str = common_token_to_piece(ctx3, next_token); printf("%s", next_token_str.c_str()); result2 += next_token_str; diff --git a/examples/server/server-common.cpp b/examples/server/server-common.cpp index 950c65c0..440b8760 100644 --- a/examples/server/server-common.cpp +++ 
b/examples/server/server-common.cpp @@ -211,12 +211,12 @@ size_t validate_utf8(const std::string& text) { return len; } -// TODO: reuse llama_detokenize +// TODO: reuse common_token_to_piece template static std::string tokens_to_str(llama_context* ctx, Iter begin, Iter end) { std::string ret; for (; begin != end; ++begin) { - ret += llama_token_to_piece(ctx, *begin); + ret += common_token_to_piece(ctx, *begin); } return ret; @@ -228,7 +228,7 @@ std::string tokens_to_str(llama_context* ctx, const llama_tokens& tokens) { // format incomplete utf-8 multibyte character for output std::string tokens_to_output_formatted_string(const llama_context* ctx, const llama_token token) { - std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); + std::string out = token == -1 ? "" : common_token_to_piece(ctx, token); // if the size is 1 and first bit is 1, meaning it's a partial character // (size > 1 meaning it's already a known token) @@ -372,8 +372,8 @@ common_prefix find_common_text_token_prefix(const llama_context* ctx, const llam llama_tokens a_sub(a.begin() + start, a.end()); llama_tokens b_sub(b.begin() + start, b.end()); - std::string a_str = llama_detokenize(ctx, a_sub, true); - std::string b_str = llama_detokenize(ctx, b_sub, true); + std::string a_str = common_token_to_piece(ctx, a_sub, true); + std::string b_str = common_token_to_piece(ctx, b_sub, true); common_prefix string_prefix; std::vector a_list; @@ -1722,7 +1722,7 @@ server_tokens::server_tokens(const llama_tokens& tokens, bool has_mtmd) : has_mt text_tokens.push_back(t); } } - return llama_detokenize(ctx, text_tokens, special); + return common_token_to_piece(ctx, text_tokens, special); } std::string server_tokens::detokenize(const llama_context* ctx, bool special, size_t start, size_t length) const { @@ -1744,7 +1744,7 @@ server_tokens::server_tokens(const llama_tokens& tokens, bool has_mtmd) : has_mt } ++i; } - return llama_detokenize(ctx, text_tokens, special); + return common_token_to_piece(ctx, text_tokens, special); } size_t server_tokens::find_n_from_tokens(const llama_context* ctx, const server_tokens& b, bool special, @@ -1812,7 +1812,7 @@ server_tokens::server_tokens(const llama_tokens& tokens, bool has_mtmd) : has_mt std::string endStr = think_token.end; llama_tokens tokens = get_text_tokens(); - std::string str = llama_detokenize(ctx, tokens, true); + std::string str = common_token_to_piece(ctx, tokens, true); std::vector> results; // Find all positions of start and end diff --git a/examples/server/server-common.h b/examples/server/server-common.h index 95cba283..52d1e5b3 100644 --- a/examples/server/server-common.h +++ b/examples/server/server-common.h @@ -164,7 +164,7 @@ size_t common_part(const std::string& a, const std::string& b); // if validate_utf8(text) == text.size(), then the whole text is valid utf8 size_t validate_utf8(const std::string& text); -// TODO: reuse llama_detokenize +// TODO: reuse common_token_to_piece std::string tokens_to_str(llama_context* ctx, const llama_tokens& tokens); diff --git a/examples/server/server-context.cpp b/examples/server/server-context.cpp index 90129cf7..3c4ff874 100644 --- a/examples/server/server-context.cpp +++ b/examples/server/server-context.cpp @@ -37,7 +37,7 @@ server_context::~server_context() { // Clear any sampling context for (server_slot& slot : slots) { if (slot.ctx_sampling != nullptr) { - llama_sampling_free(slot.ctx_sampling); + common_sampler_free(slot.ctx_sampling); } if (slot.ctx_dft) { llama_free(slot.ctx_dft); @@ -52,16 +52,16 @@ 
server_context::~server_context() { } bool server_context::load_model(const gpt_params& params_) { - params = params_; + params_base = params_; - llama_init_result llama_init = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params_base); model = llama_init.model; ctx = llama_init.context; lora_adapters = llama_init.lora_adapters; if (model == nullptr) { - LOG_ERROR("unable to load model", { {"model", params.model} }); + LOG_ERROR("unable to load model", { {"model", params_base.model} }); return false; } @@ -70,26 +70,26 @@ bool server_context::load_model(const gpt_params& params_) { add_bos_token = llama_should_add_bos_token(model); has_eos_token = llama_add_eos_token(model) != 1; - chat_templates = common_chat_templates_init(model, params.chat_template); + chat_templates = common_chat_templates_init(model, params_base.chat_template); try { - common_chat_format_example(chat_templates.get(), params.use_jinja, {}); + common_chat_format_example(chat_templates.get(), params_base.use_jinja, {}); } catch (const std::exception& e) { LOG_WARNING("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__); chat_templates = common_chat_templates_init(model, "chatml"); } - bool has_draft_model = !params.model_draft.empty() || !params.draft_params.empty(); - std::string& mmproj_path = params.mmproj.path; + bool has_draft_model = !params_base.model_draft.empty() || !params_base.draft_params.empty(); + std::string& mmproj_path = params_base.mmproj.path; if (!mmproj_path.empty()) { mtmd_context_params mparams = mtmd_context_params_default(); - mparams.use_gpu = params.mmproj_use_gpu; + mparams.use_gpu = params_base.mmproj_use_gpu; mparams.print_timings = false; - mparams.n_threads = params.n_threads; - mparams.flash_attn_type = params.flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED; - mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO; - mparams.image_min_tokens = params.image_min_tokens; - mparams.image_max_tokens = params.image_max_tokens; + mparams.n_threads = params_base.n_threads; + mparams.flash_attn_type = params_base.flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED; + mparams.verbosity = params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO; + mparams.image_min_tokens = params_base.image_min_tokens; + mparams.image_max_tokens = params_base.image_max_tokens; mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams); if (mctx == nullptr) { LOG_ERROR("failed to load multimodal model, '%s'\n", mmproj_path.c_str()); @@ -97,8 +97,8 @@ bool server_context::load_model(const gpt_params& params_) { } LOG_INFO("loaded multimodal model, '%s'\n", mmproj_path.c_str()); - if (params.ctx_shift) { - params.ctx_shift = false; + if (params_base.ctx_shift) { + params_base.ctx_shift = false; LOG_WARNING("%s\n", "ctx_shift is not supported by multimodal, it will be disabled"); } @@ -117,15 +117,15 @@ bool server_context::load_model(const gpt_params& params_) { LLAMA_LOG_INFO("\n\n==================================loading DRAFT model==================================\n\n"); gpt_params params_dft; - params_dft.devices = params.devices_draft; - params_dft.model = params.model_draft; - params_dft.n_gpu_layers = params.n_gpu_layers_draft; - params_dft.rpc_servers = params.rpc_servers; - params_dft.cache_type_k = params.cache_type_k_draft.empty() ? 
params.cache_type_k : params.cache_type_k_draft; - params_dft.cache_type_v = params.cache_type_v_draft.empty() ? params.cache_type_v : params.cache_type_v_draft; - params_dft.flash_attn = params.flash_attn; - if (!params.draft_params.empty()) { - auto [argc, argv] = parse_command_line("llama-server " + params.draft_params); + params_dft.devices = params_base.devices_draft; + params_dft.model = params_base.model_draft; + params_dft.n_gpu_layers = params_base.n_gpu_layers_draft; + params_dft.rpc_servers = params_base.rpc_servers; + params_dft.cache_type_k = params_base.cache_type_k_draft.empty() ? params_base.cache_type_k : params_base.cache_type_k_draft; + params_dft.cache_type_v = params_base.cache_type_v_draft.empty() ? params_base.cache_type_v : params_base.cache_type_v_draft; + params_dft.flash_attn = params_base.flash_attn; + if (!params_base.draft_params.empty()) { + auto [argc, argv] = parse_command_line("llama-server " + params_base.draft_params); if (!gpt_params_parse(argc, argv, params_dft)) { gpt_params_print_usage(argc, argv, params_dft); free_command_line(argc, argv); @@ -135,16 +135,16 @@ bool server_context::load_model(const gpt_params& params_) { } LOG_INFO("", { {"model", params_dft.model} }); if (params_dft.n_ctx == 0) { - params_dft.n_ctx = params.n_ctx_draft; + params_dft.n_ctx = params_base.n_ctx_draft; } - params_dft.n_ctx = params_dft.n_ctx == 0 ? params.n_ctx / params.n_parallel : params_dft.n_ctx; + params_dft.n_ctx = params_dft.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_dft.n_ctx; params_dft.n_parallel = 1; params_dft.n_batch = params_dft.n_ctx; llama_init_result llama_init_dft = llama_init_from_gpt_params(params_dft); llama_model* model_dft = llama_init_dft.model; if (model_dft == nullptr) { - LOG_ERROR("failed to load draft model", { {"model", params.model_draft} }); + LOG_ERROR("failed to load draft model", { {"model", params_base.model_draft} }); return false; } @@ -163,22 +163,22 @@ bool server_context::load_model(const gpt_params& params_) { } void server_context::init() { - const int32_t n_ctx_slot = n_ctx / params.n_parallel; + const int32_t n_ctx_slot = n_ctx / params_base.n_parallel; - LOG_INFO("initializing slots", { {"n_slots", params.n_parallel} }); + LOG_INFO("initializing slots", { {"n_slots", params_base.n_parallel} }); - for (int i = 0; i < params.n_parallel; i++) { + for (int i = 0; i < params_base.n_parallel; i++) { server_slot slot; slot.id = i; slot.ctx = ctx; slot.n_ctx = n_ctx_slot; - slot.n_predict = params.n_predict; + slot.n_predict = params_base.n_predict; slot.mctx = mctx; slot.cache_tokens.has_mtmd = mctx != nullptr; - slot.params.think_tokens = params.think_tokens; - if (params.think_tokens.exclude) { - SRV_WRN("Exclude reasoning tokens when selecting slot based on similarity: start: %s, end: %s\nuse `--reasoning-tokens none` to disable.\n", params.think_tokens.begin.c_str(), params.think_tokens.end.c_str() ); + slot.params.think_tokens = params_base.think_tokens; + if (params_base.think_tokens.exclude) { + SRV_WRN("Exclude reasoning tokens when selecting slot based on similarity: start: %s, end: %s\nuse `--reasoning-tokens none` to disable.\n", params_base.think_tokens.begin.c_str(), params_base.think_tokens.end.c_str() ); } else { SRV_WRN("%s", "Include reasoning tokens when selecting slot based on similarity\nuse `--reasoning-tokens auto` to exclude reasoning tokens.\n"); @@ -188,8 +188,8 @@ void server_context::init() { {"n_ctx_slot", slot.n_ctx} }); - const int ga_n = params.grp_attn_n; - const int ga_w = 
params.grp_attn_w; + const int ga_n = params_base.grp_attn_n; + const int ga_w = params_base.grp_attn_w; if (ga_n != 1) { GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT @@ -208,7 +208,7 @@ void server_context::init() { slot.ga_n = ga_n; slot.ga_w = ga_w; - slot.sparams = params.sparams; + slot.sparams = params_base.sparams; // Initialize speculative decoding if a draft model is loaded if (ctx_draft) { @@ -225,7 +225,7 @@ void server_context::init() { LOG_ERROR("failed to create speculator", {}); return; } - for (auto& pair : params.replacements_draft) { + for (auto& pair : params_base.replacements_draft) { llama_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str()); } @@ -245,21 +245,21 @@ void server_context::init() { const int32_t n_batch = llama_n_batch(ctx); // only a single seq_id per token is needed - batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1); + batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); } metrics.init(); - if (params.cache_ram_mib != 0) { - if (params.cache_ram_mib < 0) { + if (params_base.cache_ram_mib != 0) { + if (params_base.cache_ram_mib < 0) { LLAMA_LOG_INFO("prompt cache is enabled, size limit: %s\n", "no limit"); } else { - LLAMA_LOG_INFO("prompt cache is enabled, size limit: %d MiB\n", params.cache_ram_mib); + LLAMA_LOG_INFO("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib); } LLAMA_LOG_INFO("%s", "use `--cache-ram 0` to disable the prompt cache\n"); // only apply ram size limit. No token limit for now. - prompt_cache = std::make_unique(ctx, params.cache_ram_mib, 0); + prompt_cache = std::make_unique(ctx, params_base.cache_ram_mib, 0); } else { LLAMA_LOG_INFO("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n"); @@ -268,14 +268,14 @@ void server_context::init() { // thinking is enabled if: // 1. It's not explicitly disabled (reasoning_budget == 0) // 2. The chat template supports it - const bool enable_thinking = params.use_jinja && params.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get()); + const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get()); //LLAMA_LOG_INFO("Enable thinking? %d\n", enable_thinking); oai_parser_opt = { - /* use_jinja */ params.use_jinja, - /* prefill_assistant */ params.prefill_assistant, - /* reasoning_format */ params.reasoning_format, - /* chat_template_kwargs */ params.default_template_kwargs, + /* use_jinja */ params_base.use_jinja, + /* prefill_assistant */ params_base.prefill_assistant, + /* reasoning_format */ params_base.reasoning_format, + /* chat_template_kwargs */ params_base.default_template_kwargs, /* common_chat_templates */ chat_templates.get(), /* allow_image */ mctx ? mtmd_support_vision(mctx) : false, /* allow_audio */ mctx ? 
mtmd_support_audio(mctx) : false, @@ -500,34 +500,19 @@ size_t server_slot::find_stopping_strings(const std::string& text, const size_t void server_slot::print_timings() const { char buffer[512]; - double t_token = t_prompt_processing / n_prompt_tokens_processed; - double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; + double t_prompt = t_prompt_processing / n_prompt_tokens_processed; + double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; - //snprintf(buffer, 512, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", - // t_prompt_processing, n_prompt_tokens_processed, - // t_token, n_tokens_second); + double t_gen = t_token_generation / n_decoded; + double n_gen_second = 1e3 / t_token_generation * n_decoded; - //LOG_INFO(buffer, {}); - - double t_token_gen = t_token_generation / n_decoded; - double n_tokens_second_gen = 1e3 / t_token_generation * n_decoded; - - //snprintf(buffer, 512, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", - // t_token_generation, n_decoded, - // t_token, n_tokens_second); - - //LOG_INFO(buffer, {}); - - //snprintf(buffer, 512, " total time = %10.2f ms", t_prompt_processing + t_token_generation); - - //LOG_INFO(buffer, {}); SLT_INF(*this, "\n" "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" " eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" " total time = %10.2f ms / %5d tokens\n", - t_prompt_processing, n_prompt_tokens_processed, t_token, n_tokens_second, - t_token_generation, n_decoded, t_token_gen, n_tokens_second_gen, + t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second, + t_token_generation, n_decoded, t_gen, n_gen_second, t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); if (n_draft_total > 0) { @@ -795,7 +780,7 @@ server_slot* server_context::get_available_slot(const server_task& task) { bool server_context::launch_slot_with_task(server_slot& slot, server_task& task) { slot_params default_params; // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) - llama_sampling_params default_sparams = params.sparams; + llama_sampling_params default_sparams = params_base.sparams; auto& data = task.data; if (data.count("__oaicompat") != 0) { @@ -848,9 +833,9 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task) slot.params.post_sampling_probs = json_value(data, "post_sampling_probs", default_params.post_sampling_probs); // speculative decoding parameters - slot.params.speculative.n_max = json_value(data, "speculative.n_max", params.n_draft); - slot.params.speculative.n_min = json_value(data, "speculative.n_min", params.n_draft_min); - slot.params.speculative.p_min = json_value(data, "speculative.p_min", params.p_draft_min); + slot.params.speculative.n_max = json_value(data, "speculative.n_max", params_base.n_draft); + slot.params.speculative.n_min = json_value(data, "speculative.n_min", params_base.n_draft_min); + slot.params.speculative.p_min = json_value(data, "speculative.p_min", params_base.p_draft_min); // Clamp speculative parameters slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min); @@ -945,7 +930,7 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task) if (penalty_prompt != data.end()) { if 
(penalty_prompt->is_string()) { const auto penalty_prompt_string = penalty_prompt->get(); - slot.sparams.penalty_prompt_tokens = llama_tokenize(model, penalty_prompt_string, false); + slot.sparams.penalty_prompt_tokens = common_tokenize(model, penalty_prompt_string, false); if (slot.params.n_predict > 0) { slot.sparams.penalty_prompt_tokens.reserve(slot.sparams.penalty_prompt_tokens.size() + slot.params.n_predict); @@ -988,7 +973,7 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task) else { slot.params.oaicompat_chat_syntax.format = default_params.oaicompat_chat_syntax.format; } - common_reasoning_format reasoning_format = params.reasoning_format; + common_reasoning_format reasoning_format = params_base.reasoning_format; if (data.contains("reasoning_format")) { reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get()); } @@ -1003,7 +988,7 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task) const auto preserved_tokens = data.find("preserved_tokens"); if (preserved_tokens != data.end()) { for (const auto& t : *preserved_tokens) { - auto ids = llama_tokenize(model, t.get(), /* add_special= */ false, /* parse_special= */ true); + auto ids = common_tokenize(model, t.get(), /* add_special= */ false, /* parse_special= */ true); if (ids.size() == 1) { LOG("Preserved token: %d\n", ids[0]); slot.sparams.preserved_tokens.insert(ids[0]); @@ -1020,7 +1005,7 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task) server_grammar_trigger ct(t); if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) { const auto& word = ct.value.value; - auto ids = llama_tokenize(model, word, /* add_special= */ false, /* parse_special= */ true); + auto ids = common_tokenize(model, word, /* add_special= */ false, /* parse_special= */ true); if (ids.size() == 1) { auto token = ids[0]; if (std::find(slot.sparams.preserved_tokens.begin(), slot.sparams.preserved_tokens.end(), (llama_token)token) == slot.sparams.preserved_tokens.end()) { @@ -1085,7 +1070,7 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task) } } else if (el[0].is_string()) { - auto toks = llama_tokenize(model, el[0].get(), false); + auto toks = common_tokenize(model, el[0].get(), false); for (auto tok : toks) { slot.sparams.logit_bias[tok] = bias; } @@ -1128,9 +1113,9 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task) { if (slot.ctx_sampling != nullptr) { - llama_sampling_free(slot.ctx_sampling); + common_sampler_free(slot.ctx_sampling); } - slot.ctx_sampling = llama_sampling_init(llama_get_model_vocab(model), slot.sparams); + slot.ctx_sampling = common_sampler_init(llama_get_model_vocab(model), slot.sparams); if (slot.ctx_sampling == nullptr) { // for now, the only error that may happen here is invalid grammar send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); @@ -1174,10 +1159,10 @@ void server_context::system_prompt_update() { for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) { const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i); - llama_batch_clear(batch); + common_batch_clear(batch); for (int32_t j = 0; j < n_tokens; ++j) { - llama_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false); + common_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false); } if (llama_decode(ctx, batch) != 0) { @@ -1187,7 +1172,7 @@ void server_context::system_prompt_update() { } // assign the system KV cache to all parallel sequences - for (int32_t 
i = 1; i <= params.n_parallel; ++i) { + for (int32_t i = 1; i <= params_base.n_parallel; ++i) { llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } } @@ -1268,7 +1253,7 @@ bool server_context::process_token(completion_token_output& result, server_slot& } // check the limits - if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) { + if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) { slot.stopped_limit = true; slot.has_next_token = false; @@ -1297,7 +1282,7 @@ bool server_context::process_token(completion_token_output& result, server_slot& { "slot.n_prompt_tokens", slot.n_prompt_tokens }, { "slot.n_decoded", slot.n_decoded }, { "slot.n_predict", slot.n_predict }, - { "n_slots", params.n_parallel }, + { "n_slots", params_base.n_parallel }, { "slot.n_ctx", slot.n_ctx }, { "n_ctx", n_ctx }, { "n_ctx_train", n_ctx_train }, @@ -1330,7 +1315,7 @@ void server_context::populate_token_probs(const server_slot& slot, completion_to size_t n_vocab = llama_n_vocab(llama_get_model(ctx)); if (post_sampling) { - const auto* cur_p = llama_sampling_get_candidates(slot.ctx_sampling); + const auto* cur_p = common_sampler_get_candidates(slot.ctx_sampling); const size_t max_probs = cur_p->size; // set probability for sampled token @@ -1346,7 +1331,7 @@ void server_context::populate_token_probs(const server_slot& slot, completion_to for (size_t i = 0; i < std::min(max_probs, n_probs); i++) { result.probs.push_back({ cur_p->data[i].id, - llama_detokenize(ctx, {cur_p->data[i].id}, special), + common_token_to_piece(ctx, {cur_p->data[i].id}, special), cur_p->data[i].p }); } @@ -1362,7 +1347,7 @@ void server_context::populate_token_probs(const server_slot& slot, completion_to for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) { result.probs.push_back({ cur[i].id, - llama_detokenize(ctx, {cur[i].id}, special), + common_token_to_piece(ctx, {cur[i].id}, special), cur[i].p }); } @@ -1387,7 +1372,7 @@ json server_context::get_formated_generation(const server_slot& slot) const { return json{ {"n_ctx", slot.n_ctx}, {"n_predict", slot.n_predict}, // Server configured n_predict - {"model", params.model_alias}, + {"model", params_base.model_alias}, {"seed", slot.sparams.seed}, {"temperature", slot.sparams.temp}, {"dynatemp_range", slot.sparams.dynatemp_range}, @@ -1548,7 +1533,7 @@ void server_context::send_final_response(server_slot& slot) { {"generated_text", slot.generated_text}, // Always include full text for finish_reason logic {"id_slot", slot.id}, {"stop", true}, - {"model", params.model_alias}, + {"model", params_base.model_alias}, {"tokens_predicted", slot.n_decoded}, {"tokens_evaluated", slot.n_prompt_tokens}, {"generation_settings", get_formated_generation(slot)}, @@ -2067,12 +2052,8 @@ void server_context::context_shift_prompt(llama_context* ctx, server_slot& slot, slot.n_prompt_tokens = slot.prompt_tokens.size(); } -void server_context::update_slots() { - if (system_need_update) { - system_prompt_update(); - } - - // release slots +void server_context::release_slots() +{ for (auto& slot : slots) { if (slot.command == SLOT_COMMAND_RELEASE) { slot.state = SLOT_STATE_IDLE; @@ -2092,11 +2073,10 @@ void server_context::update_slots() { queue_tasks.notify_slot_changed(); } } +} - // check if all slots are idle - { +bool server_context::slots_idle(){ bool all_idle = true; - for (auto& slot : slots) { if (slot.state != SLOT_STATE_IDLE || slot.command != SLOT_COMMAND_NONE) { all_idle = false; @@ -2109,27 +2089,16 @@ void server_context::update_slots() { if 
(system_prompt.empty() && clean_kv_cache) { kv_cache_clear(); } - - return; + all_idle = true; } - } + return all_idle; +} - { - LOG_VERBOSE("posting NEXT_RESPONSE", {}); - - server_task task; - task.type = SERVER_TASK_TYPE_NEXT_RESPONSE; - task.id_target = -1; - - queue_tasks.post(std::move(task)); - } - - // apply context-shift if needed - // TODO: simplify and improve +void server_context::context_shift() { for (server_slot& slot : slots) { if (slot.ga_n == 1) { if (slot.is_processing() && (int)system_tokens.size() + slot.n_past >= slot.n_ctx - 1) { - if (!params.ctx_shift) { + if (!params_base.ctx_shift) { // this check is redundant (for good) // we should never get here, because generation should already stopped in process_token() send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER); @@ -2176,15 +2145,9 @@ void server_context::update_slots() { } } } +} - // start populating the batch for this iteration - llama_batch_clear(batch); - - auto accept_special_token = [&](server_slot& slot, llama_token token) { - return params.special || slot.sparams.preserved_tokens.find(token) != slot.sparams.preserved_tokens.end(); - }; - - // frist, add sampled tokens from any ongoing sequences +void server_context::add_sampled_tokens() { for (auto& slot : slots) { if (slot.state == SLOT_STATE_IDLE) { continue; @@ -2209,7 +2172,7 @@ void server_context::update_slots() { // add the sampled token to the batch slot.i_batch_dft.push_back(batch.n_tokens); - llama_batch_add(batch, slot.sampled, slot.cache_tokens.pos_next(), { slot.id }, true); + common_batch_add(batch, slot.sampled, slot.cache_tokens.pos_next(), { slot.id }, true); slot.cache_tokens.push_back(slot.sampled); if (slot.params.speculative.n_min > (int)draft.size()) { @@ -2226,7 +2189,7 @@ void server_context::update_slots() { // add all drafted tokens to the batch for (size_t i = 0; i < draft.size(); i++) { slot.i_batch_dft.push_back(batch.n_tokens); - llama_batch_add(batch, draft[i], slot.cache_tokens.pos_next(), { slot.id }, true); + common_batch_add(batch, draft[i], slot.cache_tokens.pos_next(), { slot.id }, true); slot.cache_tokens.push_back(draft[i]); } slot.drafted = std::move(draft); @@ -2236,7 +2199,7 @@ void server_context::update_slots() { // no speculative decoding slot.i_batch = batch.n_tokens; - llama_batch_add(batch, slot.sampled, slot.cache_tokens.pos_next(), { slot.id }, true); + common_batch_add(batch, slot.sampled, slot.cache_tokens.pos_next(), { slot.id }, true); slot.cache_tokens.push_back(slot.sampled); @@ -2245,18 +2208,10 @@ void server_context::update_slots() { } slot.n_past = slot.cache_tokens.n_tokens(); } +} - // process in chunks of params.n_batch - int32_t n_batch = llama_n_batch(ctx); - int32_t n_ubatch = llama_n_ubatch(ctx); - - // track if this is an embedding or non-embedding batch - // if we've added sampled tokens above, we are in non-embedding mode - // -1: none, 0: non-embedding, 1: embedding - int32_t batch_type = batch.n_tokens > 0 ? 
0 : -1; - - // next, batch any pending prompts without exceeding n_batch - if (params.cont_batching || batch.n_tokens == 0) { +void server_context::batch_pending_prompt(const int32_t n_ubatch, const int32_t n_batch, int32_t & batch_type) { + if (params_base.cont_batching || batch.n_tokens == 0) { for (auto& slot : slots) { // this slot still has a prompt to be processed if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT) { @@ -2275,8 +2230,8 @@ void server_context::update_slots() { if (slot.infill) { const bool add_bos = llama_should_add_bos_token(model); bool suff_rm_leading_spc = true; - if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { - params.input_suffix.erase(0, 1); + if (params_base.input_suffix.find_first_of(' ') == 0 && params_base.input_suffix.size() > 1) { + params_base.input_suffix.erase(0, 1); suff_rm_leading_spc = false; } @@ -2291,8 +2246,8 @@ void server_context::update_slots() { prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model)); - auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens; - auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens; + auto embd_inp = params_base.spm_infill ? suffix_tokens : prefix_tokens; + auto embd_end = params_base.spm_infill ? prefix_tokens : suffix_tokens; if (add_bos) { embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); } @@ -2350,7 +2305,7 @@ void server_context::update_slots() { // if input prompt is too big, truncate it (if group attention self-extend is disabled) // context shift for prompt processing if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) { - if (!params.ctx_shift) { + if (!params_base.ctx_shift) { send_error(slot, "the request exceeds the available context size, try increasing it", ERROR_TYPE_SERVER); slot.release(); continue; @@ -2389,7 +2344,7 @@ void server_context::update_slots() { else { slot.n_discarded_prompt = 0; } - llama_sampling_reset(llama_get_model_vocab(model), slot.ctx_sampling); + common_sampler_reset(llama_get_model_vocab(model), slot.ctx_sampling); if (!slot.params.cache_prompt) { slot.n_past_se = 0; @@ -2424,7 +2379,7 @@ void server_context::update_slots() { // push the prompt into the sampling context (do not apply grammar) for (int i = 0; i < slot.n_past; ++i) { - llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false); + common_sampler_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false); } } } @@ -2486,7 +2441,7 @@ void server_context::update_slots() { slot.n_past_se = 0; slot.ga_i = 0; // TODO: is the system prompt ever in the sampling context? 
- llama_sampling_reset(llama_get_model_vocab(model), slot.ctx_sampling); + common_sampler_reset(llama_get_model_vocab(model), slot.ctx_sampling); } LOG_INFO("kv cache rm [p0, end)", { @@ -2546,7 +2501,7 @@ void server_context::update_slots() { } int p0 = system_tokens.size() + slot.cache_tokens.pos_next(); - llama_batch_add(batch, cur_tok, p0, { slot.id }, slot.embedding); + common_batch_add(batch, cur_tok, p0, { slot.id }, slot.embedding); slot.cache_tokens.push_back(cur_tok); @@ -2571,11 +2526,11 @@ void server_context::update_slots() { GGML_ASSERT(batch.n_tokens > 0); GGML_ASSERT((size_t)slot.n_prompt_tokens == slot.prompt_tokens.size()); - llama_sampling_reset(llama_get_model_vocab(model), slot.ctx_sampling); + common_sampler_reset(llama_get_model_vocab(model), slot.ctx_sampling); for (int i = 0; i < slot.n_prompt_tokens; ++i) { llama_token id = slot.prompt_tokens[i]; if (id != LLAMA_TOKEN_NULL) { - llama_sampling_accept(slot.ctx_sampling, ctx, id, false); + common_sampler_accept(slot.ctx_sampling, ctx, id, false); } } @@ -2599,51 +2554,107 @@ void server_context::update_slots() { } } } +} - if (batch.n_tokens == 0) { - LOG_VERBOSE("no tokens to decode", {}); - return; +void server_context::extend_context(const int32_t n_tokens) { + for (auto& slot : slots) { + if (slot.ga_n != 1) { + // context extension via Self-Extend + // TODO: simplify and/or abstract this + while (slot.n_past_se >= slot.ga_i + slot.ga_w) { + const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w; + const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); + const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; + + LOG_TEE("\n"); + LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); + LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); + LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); + + llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); + llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); + llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd); + + slot.n_past_se -= bd; + + slot.ga_i += slot.ga_w / slot.ga_n; + + LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); + } + + slot.n_past_se += n_tokens; + } } +} - LOG_VERBOSE("decoding batch", { - {"n_tokens", batch.n_tokens}, - }); +void server_context::speculative_decoding_accept() { + for (auto& slot : slots) { + if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch_dft.empty()) { + continue; + } - // make sure we're in the right embedding mode - llama_set_embeddings(ctx, batch_type == 1); + size_t n_draft = slot.drafted.size(); - // process the created batch of tokens - for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); + // the accepted tokens from the speculation + const auto ids = common_sampler_sample_and_accept_n(slot.ctx_sampling, ctx, slot.i_batch_dft, slot.drafted); + slot.i_batch_dft.clear(); + slot.drafted.clear(); - for (auto& slot : slots) { - if (slot.ga_n != 1) { - // context extension via Self-Extend - // TODO: simplify and/or abstract this - 
while (slot.n_past_se >= slot.ga_i + slot.ga_w) { - const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w; - const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); - const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; + slot.n_past += ids.size(); + slot.n_decoded += ids.size(); + const int64_t t_current = ggml_time_us(); + slot.t_token_generation = std::max(1, t_current - slot.t_start_generation) / 1e3; - LOG_TEE("\n"); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); - LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); + // update how many tokens out of those tested were accepted + slot.n_draft_accepted += ids.size() - 1; - llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); - llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); - llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd); + // rollback to the state before sampling the draft tokens + slot.cache_tokens.keep_first(slot.cache_tokens.n_tokens() - n_draft); + // add accepted tokens to the prompt + slot.cache_tokens.insert({ ids.begin(), ids.end() - 1 }); + slot.sampled = ids.back(); // last accepted token + slot.n_past = slot.cache_tokens.n_tokens(); + llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1); - slot.n_past_se -= bd; + for (size_t i = 0; i < ids.size(); ++i) { + completion_token_output result; - slot.ga_i += slot.ga_w / slot.ga_n; + result.tok = ids[i]; + result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok)); + result.prob = 1.0f; // set later - LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); - } + if (slot.sparams.n_probs > 0) { + populate_token_probs(slot, result, slot.params.post_sampling_probs, params_base.special, i); + } - slot.n_past_se += n_tokens; + if (!process_token(result, slot)) { + // release slot because of stop condition + slot.release(); + slot.print_timings(); + send_final_response(slot); + metrics.on_prediction(slot); + break; } } + SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int)ids.size() - 1, (int)slot.drafted.size(), slot.n_past); + LOG_VERBOSE("speculative decoding result", { + {"id_slot", slot.id}, + {"accepted", (int)ids.size() - 1}, + {"total", (int)slot.drafted.size()}, + {"new_n_past", slot.n_past} + }); + } +} + + +bool server_context::accept_special_token(const server_slot& slot, const llama_token token) { + return params_base.special || slot.sparams.preserved_tokens.find(token) != slot.sparams.preserved_tokens.end(); +}; + +void server_context::process_batch_tokens(int32_t & n_batch) { + for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { + const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); + extend_context(n_tokens); llama_batch batch_view = { n_tokens, @@ -2661,14 +2672,11 @@ void server_context::update_slots() { if (ret != 0) { if (n_batch == 1 || ret < 0) { int user_cancel = -3; - // if you get here, it means the KV cache is full - try increasing it via the context size if (ret == user_cancel) { - 
LOG_ERROR("Decode process is cancelled by user", { - {"i", i}, - {"n_batch", ret}, - {"ret", ret}, - }); - } else { + LLAMA_LOG_INFO("Decode process is cancelled by user.\n"); + } + else { + // if you get here, it means the KV cache is full - try increasing it via the context size LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", { {"i", i}, {"n_batch", ret}, @@ -2684,12 +2692,9 @@ void server_context::update_slots() { LLAMA_LOG_INFO("n_past = %d\n", (int)slot.cache_tokens.size()); send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size."); } - } break; // break loop of n_batch } - - // retry with half the batch size to try to find a free slot in the KV cache n_batch /= 2; i -= n_batch; @@ -2703,10 +2708,6 @@ void server_context::update_slots() { continue; // continue loop of n_batch } - // technically, measuring the time here excludes the sampling time for the last batch - // but on the other hand, we don't want to do too many system calls to measure the time, so it's ok - const int64_t t_current = ggml_time_us(); - for (auto& slot : slots) { if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch < (int)i || slot.i_batch >= (int)(i + n_tokens)) { continue; // continue loop of slots @@ -2725,9 +2726,9 @@ void server_context::update_slots() { continue; // sample using speculative decoding } const int tok_idx = slot.i_batch - i; - const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, tok_idx); + const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, NULL, tok_idx); - llama_sampling_accept(slot.ctx_sampling, ctx, id, true); + common_sampler_accept(slot.ctx_sampling, ctx, id, true); slot.n_decoded += 1; @@ -2739,15 +2740,14 @@ void server_context::update_slots() { metrics.on_prompt_eval(slot); } - //slot.t_token_generation = (t_current - slot.t_start_generation) / 1e3; slot.t_token_generation = std::max(1, t_current - slot.t_start_generation) / 1e3; result.tok = id; result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs - result.text_to_send = llama_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok)); + result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok)); if (slot.sparams.n_probs > 0) { - populate_token_probs(slot, result, slot.params.post_sampling_probs, params.special, tok_idx); + populate_token_probs(slot, result, slot.params.post_sampling_probs, params_base.special, tok_idx); } if (!process_token(result, slot)) { @@ -2761,64 +2761,67 @@ void server_context::update_slots() { } // speculative decoding - main model sample and accept - for (auto& slot : slots) { - if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch_dft.empty()) { - continue; - } - - size_t n_draft = slot.drafted.size(); - - // the accepted tokens from the speculation - const auto ids = llama_sampling_sample_and_accept_n(slot.ctx_sampling, ctx, slot.i_batch_dft, slot.drafted); - slot.i_batch_dft.clear(); - slot.drafted.clear(); - - slot.n_past += ids.size(); - slot.n_decoded += ids.size(); - - slot.t_token_generation = std::max(1, t_current - slot.t_start_generation) / 1e3; - - // update how many tokens out of those tested were accepted - slot.n_draft_accepted += ids.size() - 1; - - // rollback to the state before sampling the draft tokens - slot.cache_tokens.keep_first(slot.cache_tokens.n_tokens() - n_draft); - // slot.n_past -= n_draft; - // add accepted tokens to the prompt - slot.cache_tokens.insert({ 
ids.begin(), ids.end() - 1 }); - slot.sampled = ids.back(); // last accepted token - slot.n_past = slot.cache_tokens.n_tokens(); - llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1); - - for (size_t i = 0; i < ids.size(); ++i) { - completion_token_output result; - - result.tok = ids[i]; - result.text_to_send = llama_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok)); - result.prob = 1.0f; // set later - - if (slot.sparams.n_probs > 0) { - populate_token_probs(slot, result, slot.params.post_sampling_probs, params.special, i); - } - - if (!process_token(result, slot)) { - // release slot because of stop condition - slot.release(); - slot.print_timings(); - send_final_response(slot); - metrics.on_prediction(slot); - break; - } - } - SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int)ids.size() - 1, (int)slot.drafted.size(), slot.n_past); - LOG_VERBOSE("speculative decoding result", { - {"id_slot", slot.id}, - {"accepted", (int)ids.size() - 1}, - {"total", (int)slot.drafted.size()}, - {"new_n_past", slot.n_past} - }); - } + speculative_decoding_accept(); } +} + +void server_context::update_slots() { + if (system_need_update) { + system_prompt_update(); + } + // release slots + release_slots(); + + // check if all slots are idle + if (slots_idle()) { + return; + } + + { + LOG_VERBOSE("posting NEXT_RESPONSE", {}); + server_task task; + task.type = SERVER_TASK_TYPE_NEXT_RESPONSE; + task.id_target = -1; + + queue_tasks.post(std::move(task)); + } + + // apply context-shift if needed + // TODO: simplify and improve + context_shift(); + + // start populating the batch for this iteration + common_batch_clear(batch); + + // first, add sampled tokens from any ongoing sequences + add_sampled_tokens(); + + // process in chunks of params.n_batch + int32_t n_batch = llama_n_batch(ctx); + int32_t n_ubatch = llama_n_ubatch(ctx); + + // track if this is an embedding or non-embedding batch + // if we've added sampled tokens above, we are in non-embedding mode + // -1: none, 0: non-embedding, 1: embedding + int32_t batch_type = batch.n_tokens > 0 ? 
0 : -1; + + // next, batch any pending prompts without exceeding n_batch + batch_pending_prompt(n_ubatch, n_batch, batch_type); + + if (batch.n_tokens == 0) { + LOG_VERBOSE("no tokens to decode", {}); + return; + } + + LOG_VERBOSE("decoding batch", { + {"n_tokens", batch.n_tokens}, + }); + + // make sure we're in the right embedding mode + llama_set_embeddings(ctx, batch_type == 1); + + // process the created batch of tokens + process_batch_tokens(n_batch); LOG_VERBOSE("run slots completed", {}); } diff --git a/examples/server/server-context.h b/examples/server/server-context.h index 9e4938f7..34493565 100644 --- a/examples/server/server-context.h +++ b/examples/server/server-context.h @@ -184,7 +184,7 @@ struct server_context { llama_context* ctx = nullptr; std::vector lora_adapters; - gpt_params params; + gpt_params params_base; llama_batch batch; @@ -297,5 +297,23 @@ struct server_context { void update_slots(); + void release_slots(); + + bool slots_idle(); + + void context_shift(); + + void add_sampled_tokens(); + + void batch_pending_prompt(const int32_t n_ubatch, const int32_t n_batch, int32_t & batch_type); + + void process_batch_tokens(int32_t & n_batch); + + void extend_context(const int32_t n_tokens); + + void speculative_decoding_accept(); + + bool accept_special_token(const server_slot& slot, const llama_token token); + json model_meta() const; }; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index baea1bfb..fa792f9e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -593,7 +593,7 @@ int main(int argc, char ** argv) { }); LOG_INFO("chat template", { - {"chat_example", common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params.use_jinja, {}).c_str() + {"chat_example", common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja, {}).c_str() }, {"built_in", params.chat_template.empty()}, }); @@ -990,15 +990,15 @@ int main(int argc, char ** argv) { } json data = { { "system_prompt", ctx_server.system_prompt.c_str() }, - { "model_alias", ctx_server.params.model_alias }, - { "model_path", ctx_server.params.model}, + { "model_alias", ctx_server.params_base.model_alias }, + { "model_path", ctx_server.params_base.model}, { "default_generation_settings", ctx_server.default_generation_settings_for_props }, - { "total_slots", ctx_server.params.n_parallel }, - { "model_name", get_model_name(ctx_server.params.model)}, + { "total_slots", ctx_server.params_base.n_parallel }, + { "model_name", get_model_name(ctx_server.params_base.model)}, { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) }, - { "bos_token", llama_token_to_piece(ctx_server.ctx, llama_token_bos(ctx_server.model), /* special= */ true)}, - { "eos_token", llama_token_to_piece(ctx_server.ctx, llama_token_eos(ctx_server.model), /* special= */ true)}, - { "model_path", ctx_server.params.model }, + { "bos_token", common_token_to_piece(ctx_server.ctx, llama_token_bos(ctx_server.model), /* special= */ true)}, + { "eos_token", common_token_to_piece(ctx_server.ctx, llama_token_eos(ctx_server.model), /* special= */ true)}, + { "model_path", ctx_server.params_base.model }, { "modalities", json { {"vision", ctx_server.oai_parser_opt.allow_image}, {"audio", ctx_server.oai_parser_opt.allow_audio}, @@ -1007,7 +1007,7 @@ int main(int argc, char ** argv) { }; - if (ctx_server.params.use_jinja) { + if (ctx_server.params_base.use_jinja) { if (auto tool_use_src = 
common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) { data["chat_template_tool_use"] = tool_use_src; } @@ -1026,8 +1026,8 @@ int main(int argc, char ** argv) { } } json data = { - { "model_name", get_model_name(ctx_server.params.model)}, - { "model_path", ctx_server.params.model }, + { "model_name", get_model_name(ctx_server.params_base.model)}, + { "model_path", ctx_server.params_base.model }, { "modalities", json { {"vision", ctx_server.oai_parser_opt.allow_image}, {"audio", ctx_server.oai_parser_opt.allow_audio}, @@ -1088,7 +1088,7 @@ int main(int argc, char ** argv) { // OAI-compat task.params.oaicompat = oaicompat; task.params.oaicompat_cmpl_id = completion_id; - task.params.oaicompat_model = get_model_name(ctx_server.params.model); + task.params.oaicompat_model = get_model_name(ctx_server.params_base.model); tasks.push_back(std::move(task)); } @@ -1350,7 +1350,7 @@ int main(int argc, char ** argv) { }; const auto handle_embeddings_impl = [&ctx_server](const httplib::Request& req, httplib::Response& res, oaicompat_type oaicompat) { - if (!ctx_server.params.embedding) { + if (!ctx_server.params_base.embedding) { res_err(res, format_error_response("This server does not support embeddings. Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 69a92cf7..09e2cc9b 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -77,7 +77,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n"); for (auto id : tokens_list) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + fprintf(stderr, "%s", common_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -89,7 +89,7 @@ int main(int argc, char ** argv) { // evaluate the initial prompt for (size_t i = 0; i < tokens_list.size(); i++) { - llama_batch_add(batch, tokens_list[i], i, { 0 }, false); + common_batch_add(batch, tokens_list[i], i, { 0 }, false); } // llama_decode will output logits only for the last token of the prompt @@ -132,14 +132,14 @@ int main(int argc, char ** argv) { break; } - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + LOG_TEE("%s", common_token_to_piece(ctx, new_token_id).c_str()); fflush(stdout); // prepare the next batch - llama_batch_clear(batch); + common_batch_clear(batch); // push this new token for next evaluation - llama_batch_add(batch, new_token_id, n_cur, { 0 }, true); + common_batch_add(batch, new_token_id, n_cur, { 0 }, true); n_decode += 1; } diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index cb88ea1f..8fb1c7d3 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -124,8 +124,8 @@ int main(int argc, char ** argv) { if (std::strcmp(token_text_tgt, token_text_dft) != 0) { fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__); fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i, - llama_token_to_piece(ctx_tgt, i).c_str(), - llama_token_to_piece(ctx_dft, i).c_str()); + common_token_to_piece(ctx_tgt, i).c_str(), + common_token_to_piece(ctx_dft, i).c_str()); return 1; } } @@ -147,7 +147,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n\n"); for (auto id : inp) { - fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str()); + fprintf(stderr, "%s", common_token_to_piece(ctx_tgt, id).c_str()); } fflush(stderr); @@ -180,7 +180,7 @@ int main(int argc, char ** argv) { bool has_eos = 
false; // target model sampling context - struct llama_sampling_context * ctx_sampling = llama_sampling_init(llama_get_model_vocab(model_tgt), params.sparams); + struct llama_sampling_context * ctx_sampling = common_sampler_init(llama_get_model_vocab(model_tgt), params.sparams); // draft sequence data std::vector drafts(n_seq_dft); @@ -191,7 +191,7 @@ int main(int argc, char ** argv) { } for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].ctx_sampling = llama_sampling_init(llama_get_model_vocab(model_dft), params.sparams); + drafts[s].ctx_sampling = common_sampler_init(llama_get_model_vocab(model_dft), params.sparams); } llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); @@ -277,13 +277,13 @@ int main(int argc, char ** argv) { s_keep = s; accept = true; token_id = drafts[s].tokens[i_dft]; - token_str = llama_token_to_piece(ctx_tgt, token_id); - llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true); + token_str = common_token_to_piece(ctx_tgt, token_id); + common_sampler_accept(ctx_sampling, ctx_tgt, token_id, true); LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str()); break; } else { - LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str()); + LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], common_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str()); drafts[s].active = false; // calculate residual probability @@ -333,8 +333,8 @@ int main(int argc, char ** argv) { // sample from the target model LOG("all drafted tokens were rejected, sampling from residual distribution\n"); token_id = llama_sample_token(ctx_tgt, &dist_tgt); - llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true); - token_str = llama_token_to_piece(ctx_tgt, token_id); + common_sampler_accept(ctx_sampling, ctx_tgt, token_id, true); + token_str = common_token_to_piece(ctx_tgt, token_id); } } else { @@ -342,13 +342,13 @@ int main(int argc, char ** argv) { // sample from the target model LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); - token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); + token_id = common_sampler_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); - llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true); + common_sampler_accept(ctx_sampling, ctx_tgt, token_id, true); //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); - token_str = llama_token_to_piece(ctx_tgt, token_id); + token_str = common_token_to_piece(ctx_tgt, token_id); for (int s = 0; s < n_seq_dft; ++s) { if (!drafts[s].active) { @@ -420,8 +420,8 @@ int main(int argc, char ** argv) { drafts[0].dists.push_back(std::vector()); drafts[0].i_batch_tgt.push_back(0); - llama_batch_clear(batch_dft); - llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); + common_batch_clear(batch_dft); + common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); @@ -447,8 +447,8 @@ int main(int argc, char ** argv) { drafts[0].drafting = true; drafts[0].i_batch_dft = 0; - llama_batch_clear(batch_tgt); - llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); + common_batch_clear(batch_tgt); + common_batch_add (batch_tgt, 
drafts[0].tokens[0], n_past_tgt, { 0 }, true); // sample n_draft tokens from the draft model using tree-based sampling for (int i = 0; i < n_draft; ++i) { @@ -463,13 +463,13 @@ int main(int argc, char ** argv) { continue; } - llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); + common_sampler_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); const auto & cur_p = drafts[s].ctx_sampling->cur; for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) { LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", - k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); + k, s, i, cur_p[k].id, cur_p[k].p, common_token_to_piece(ctx_dft, cur_p[k].id).c_str()); } std::vector sa(1, s); @@ -519,7 +519,7 @@ int main(int argc, char ** argv) { const int s = sa[is]; - llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true); + common_sampler_accept(drafts[s].ctx_sampling, ctx_dft, id, true); drafts[s].tokens.push_back(id); // save cur_p.data into drafts[s].dists @@ -528,12 +528,12 @@ int main(int argc, char ** argv) { // add unique drafted tokens to the target batch drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); - llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); + common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); // add the token to the batch for batched decoding with the draft model drafts[s].i_batch_dft = batch_dft.n_tokens; - llama_batch_add(batch_dft, id, n_past_cur, { s }, true); + common_batch_add(batch_dft, id, n_past_cur, { s }, true); if (batch_tgt.n_tokens > n_draft) { drafts[s].drafting = false; @@ -599,9 +599,9 @@ int main(int argc, char ** argv) { LOG_TEE("\ntarget:\n"); llama_print_timings(ctx_tgt); - llama_sampling_free(ctx_sampling); + common_sampler_free(ctx_sampling); for (int s = 0; s < n_seq_dft; ++s) { - llama_sampling_free(drafts[s].ctx_sampling); + common_sampler_free(drafts[s].ctx_sampling); } llama_batch_free(batch_dft); diff --git a/examples/sweep-bench/sweep-bench.cpp b/examples/sweep-bench/sweep-bench.cpp index ef3b3d59..449a0b66 100644 --- a/examples/sweep-bench/sweep-bench.cpp +++ b/examples/sweep-bench/sweep-bench.cpp @@ -108,7 +108,7 @@ int main(int argc, char ** argv) { // warm up if (params.warmup) { - llama_batch_add(batch, bos, 0, { 0 }, false); + common_batch_add(batch, bos, 0, { 0 }, false); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { LOG_TEE("%s: llama_decode() failed\n", __func__); @@ -120,10 +120,10 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm(ctx, 0, params.n_ubatch, -1); // prepare batch of pp size for prompt processing performance measurement - llama_batch_clear(batch); + common_batch_clear(batch); for (unsigned int i = 0; i < params.n_ubatch; ++i) { - llama_batch_add(batch, std::rand() % n_vocab, i, { 0 }, false); + common_batch_add(batch, std::rand() % n_vocab, i, { 0 }, false); } if (!decode_helper(ctx, batch, ctx_params.n_ubatch)) { @@ -132,7 +132,7 @@ int main(int argc, char ** argv) { } } - llama_batch_clear(batch); + common_batch_clear(batch); llama_kv_cache_clear(ctx); for (unsigned int n_kv = 0; n_kv < n_kv_max; n_kv += params.n_ubatch) { @@ -143,8 +143,8 @@ int main(int argc, char ** argv) { const auto t_tg_start = ggml_time_us(); for (unsigned int i = 0; i < tg; ++i) { - llama_batch_clear(batch); - llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, true); + common_batch_clear(batch); + common_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, true); if 
(!decode_helper(ctx, batch, ctx_params.n_batch)) { LOG_TEE("%s: llama_decode() failed\n", __func__); @@ -158,10 +158,10 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm(ctx, 0, n_kv, -1); // prepare batch of pp size for prompt processing performance measurement - llama_batch_clear(batch); + common_batch_clear(batch); for (unsigned int i = 0; i < pp; ++i) { - llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, false); + common_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, false); } batch.logits[batch.n_tokens - 1] = true; diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index 17f5e496..96aa46e6 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -367,7 +367,7 @@ int main(int raw_argc, char ** raw_argv) { const bool parse_special = !no_parse_special; std::vector tokens; - tokens = ::llama_tokenize(model, prompt, add_bos, parse_special); + tokens = ::common_tokenize(model, prompt, add_bos, parse_special); if (printing_ids) { printf("["); @@ -382,7 +382,7 @@ int main(int raw_argc, char ** raw_argv) { } else { bool invalid_utf8 = false; printf("%6d -> '", tokens[i]); - write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8); + write_utf8_cstr_to_stdout(common_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8); if (invalid_utf8) { printf("' (utf-8 decode failure)\n"); } else { diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index db8e3520..9a57a31b 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -614,7 +614,9 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t } GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { +#ifndef NDEBUG printf("%s(%s -> %s)\n", __func__, src->name, dst->name); +#endif if (ggml_backend_buffer_is_cuda(src->buffer)) { ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context; ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context; diff --git a/include/llama.h b/include/llama.h index 31325cc7..67f46b50 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1125,7 +1125,7 @@ extern "C" { int32_t lstrip, bool special); - /// @details Convert the provided tokens into text (inverse of llama_tokenize()). + /// @details Convert the provided tokens into text (inverse of common_tokenize()). /// @param text The char pointer must be large enough to hold the resulting text. /// @return Returns the number of chars/bytes on success, no more than text_len_max. /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned. 
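Note: the call-site updates above (simple, speculative, sweep-bench, tokenize) all follow the same mechanical pattern. Below is a minimal sketch of an updated call site using the renamed batch helpers, assuming common.h and llama.h are included and that the context and batch were created with llama_init/llama_batch_init as in the examples; the helper name eval_tokens is illustrative and not part of this patch.

static bool eval_tokens(llama_context * ctx, llama_batch & batch,
                        const std::vector<llama_token> & tokens, llama_pos p0) {
    common_batch_clear(batch);                                  // was: llama_batch_clear
    for (size_t i = 0; i < tokens.size(); ++i) {
        // token id, position, sequence ids, request logits for this token?
        common_batch_add(batch, tokens[i], p0 + (llama_pos) i, { 0 }, false);   // was: llama_batch_add
    }
    batch.logits[batch.n_tokens - 1] = true;                    // logits only for the last token
    return llama_decode(ctx, batch) == 0;
}
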
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 80b52d47..64d512c5 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1576,7 +1576,7 @@ struct llama_vocab::impl { std::vector id_to_token; std::vector cache_special_tokens; - std::vector cache_token_to_piece; // llama_token_to_piece(special = true); + std::vector cache_token_to_piece; // common_token_to_piece(special = true); struct pair_hash { size_t operator()(const std::pair & p) const { return std::hash{}(p.first) ^ //create some hash for pair @@ -3639,7 +3639,7 @@ int32_t llama_vocab_token_to_piece( return vocab->token_to_piece(token, buf, length, lstrip, special); } -//int32_t llama_detokenize( +//int32_t common_token_to_piece( // const struct llama_vocab * vocab, // const llama_token * tokens, // int32_t n_tokens, diff --git a/src/unicode.cpp b/src/unicode.cpp index 65f36651..19d8d409 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -802,7 +802,7 @@ std::vector unicode_cpts_from_utf8(const std::string & utf8) { result.push_back(unicode_cpt_from_utf8(utf8, offset)); } catch (const std::invalid_argument & /*ex*/) { - // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize + // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond common_tokenize ++offset; result.emplace_back(0xFFFD); // replacement character } diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index d3d21331..7ea9e48a 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -199,7 +199,7 @@ int main(int argc, char **argv) { printf("\n"); printf("src: '%s'\n", test_kv.first.c_str()); - printf("res: '%s'\n", llama_detokenize(ctx, res).c_str()); + printf("res: '%s'\n", common_token_to_piece(ctx, res).c_str()); printf("tok: "); for (const auto & tok : res) { printf("%d ", tok); @@ -216,16 +216,16 @@ int main(int argc, char **argv) { if (!correct) { fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, - llama_detokenize(ctx, res).c_str(), - llama_detokenize(ctx, test_kv.second).c_str()); + common_token_to_piece(ctx, res).c_str(), + common_token_to_piece(ctx, test_kv.second).c_str()); fprintf(stderr, "%s : expected tokens: ", __func__); for (const auto & t : test_kv.second) { - fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str()); + fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str()); } fprintf(stderr, "\n"); fprintf(stderr, "%s : got tokens: ", __func__); for (const auto & t : res) { - fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str()); + fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str()); } fprintf(stderr, "\n"); @@ -272,7 +272,7 @@ int main(int argc, char **argv) { } for (const auto & tok : res) { - //ofs << tok << " '" << string_strip(llama_detokenize(ctx, std::vector{tok})) << "'" << std::endl; + //ofs << tok << " '" << string_strip(common_token_to_piece(ctx, std::vector{tok})) << "'" << std::endl; ofs << tok << "\n"; } } diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp index 9498387e..7f55784e 100644 --- a/tests/test-tokenizer-1-bpe.cpp +++ b/tests/test-tokenizer-1-bpe.cpp @@ -78,7 +78,7 @@ int main(int argc, char **argv) { const int n_vocab = llama_n_vocab(model); for (int i = 0; i < n_vocab; ++i) { - std::string str = llama_detokenize(ctx, std::vector(1, i)); + std::string str = common_token_to_piece(ctx, std::vector(1, i)); try { auto cps = 
unicode_cpts_from_utf8(str); std::vector tokens = llama_tokenize(ctx, str, false, true); @@ -94,7 +94,7 @@ int main(int argc, char **argv) { fprintf(stderr, "]\n"); return 2; } - std::string check = llama_detokenize(ctx, tokens); + std::string check = common_token_to_piece(ctx, tokens); if (check != str) { fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n", __func__, i, str.c_str(), str.length(), check.c_str(), check.length()); @@ -124,7 +124,7 @@ int main(int argc, char **argv) { std::string str = unicode_cpt_to_utf8(cp); std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize(ctx, tokens); + std::string check = common_token_to_piece(ctx, tokens); if (cp != 9601 && str != check) { fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", cp, check.c_str(), check.length(), str.c_str(), str.length()); diff --git a/tests/test-tokenizer-1-spm.cpp b/tests/test-tokenizer-1-spm.cpp index 7ca9e2ca..ac47b547 100644 --- a/tests/test-tokenizer-1-spm.cpp +++ b/tests/test-tokenizer-1-spm.cpp @@ -66,9 +66,9 @@ int main(int argc, char ** argv) { const int n_vocab = llama_n_vocab(model); for (int i = 0; i < n_vocab; ++i) { - std::string str = llama_detokenize(ctx, std::vector(1, i), true); + std::string str = common_token_to_piece(ctx, std::vector(1, i), true); std::vector tokens = llama_tokenize(ctx, str, false, true); - std::string check = llama_detokenize(ctx, tokens); + std::string check = common_token_to_piece(ctx, tokens); if (check != str) { fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n", __func__, i, str.c_str(), str.length(), check.c_str(), check.length()); @@ -94,7 +94,7 @@ int main(int argc, char ** argv) { std::string str = unicode_cpt_to_utf8(cp); std::vector tokens = llama_tokenize(ctx, str, false, true); - std::string check = llama_detokenize(ctx, tokens); + std::string check = common_token_to_piece(ctx, tokens); if (cp != 9601 && str != check) { fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", cp, check.c_str(), check.length(), str.c_str(), str.length());
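
For reference, the vector overload of common_token_to_piece is the replacement for llama_detokenize in these tests. A minimal sketch of the round-trip check the updated tokenizer tests rely on, assuming an initialized llama_context; the function name roundtrip_ok is illustrative and not part of this patch.

static bool roundtrip_ok(llama_context * ctx, llama_token tok) {
    // render a single token, including special/control tokens (was: llama_detokenize)
    const std::string str = common_token_to_piece(ctx, std::vector<llama_token>(1, tok), true);
    // re-tokenize the rendered text with the renamed model-level helper
    const std::vector<llama_token> toks =
        common_tokenize(llama_get_model(ctx), str, /*add_special=*/false, /*parse_special=*/true);
    // the tokens must detokenize back to the same string
    return common_token_to_piece(ctx, toks, true) == str;
}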