Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-04-22 15:39:23 +00:00)
Server: rename functions and refactor code

- rename functions
- refactor update slots
- rename params_base
- rename timings
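The diff below is largely a mechanical rename: the helpers in the common library drop their llama_ prefix in favour of common_, and the KV-cache calls move to the llama_memory_* names. As a quick orientation, here is a minimal before/after sketch of a call site, assuming only the helper signatures that appear in the hunks below (the surrounding functions and the common.h include are illustrative, not part of the commit):

#include <string>

#include "common.h"   // assumed to declare the llama_/common_ helpers shown in this diff

// hypothetical call site written against the old names
static void step_before(llama_context * ctx, llama_sampling_context * smpl, llama_batch & batch, int n_past) {
    llama_batch_clear(batch);
    const llama_token id = llama_sampling_sample(smpl, ctx, nullptr, 0);
    llama_sampling_accept(smpl, ctx, id, true);
    const std::string piece = llama_token_to_piece(ctx, id);   // render the sampled token
    llama_batch_add(batch, id, n_past, { 0 }, true);
}

// the same call site after this commit's rename
static void step_after(llama_context * ctx, llama_sampling_context * smpl, llama_batch & batch, int n_past) {
    common_batch_clear(batch);
    const llama_token id = common_sampler_sample(smpl, ctx, nullptr, 0);
    common_sampler_accept(smpl, ctx, id, true);
    const std::string piece = common_token_to_piece(ctx, id);  // same behaviour, new name
    common_batch_add(batch, id, n_past, { 0 }, true);
}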
@@ -3054,7 +3054,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
if (llama_model_has_decoder(model)) {
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
}
- llama_kv_cache_clear(lctx);
+ llama_memory_clear(lctx);
llama_synchronize(lctx);
llama_reset_timings(lctx);
}
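The llama_kv_cache_* calls are renamed to llama_memory_* throughout this diff with identical arguments. A small sketch of the context-shift step after the rename, mirroring the main-loop hunks further down (the wrapper function itself is hypothetical):

// drop the oldest n_discard cached tokens of sequence 0 (keeping the first n_keep)
// and slide the remaining cells left so that positions stay contiguous
static void shift_context(llama_context * ctx, int n_keep, int n_discard, int n_past) {
    llama_memory_seq_rm (ctx, 0, n_keep, n_keep + n_discard);
    llama_memory_seq_add(ctx, 0, n_keep + n_discard, n_past, -n_discard);
}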
@@ -3564,11 +3564,11 @@ struct llama_model * llama_load_model_from_hf(
// Batch utils
//

- void llama_batch_clear(struct llama_batch & batch) {
+ void common_batch_clear(struct llama_batch & batch) {
batch.n_tokens = 0;
}

- void llama_batch_add(
+ void common_batch_add(
struct llama_batch & batch,
llama_token id,
llama_pos pos,

@@ -3595,10 +3595,10 @@ std::vector<llama_token> llama_tokenize(
const std::string & text,
bool add_special,
bool parse_special) {
- return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+ return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
}

- std::vector<llama_token> llama_tokenize(
+ std::vector<llama_token> common_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_special,

@@ -3640,7 +3640,7 @@ std::vector<llama_token> llama_tokenize(
return result;
}

- std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+ std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);

@@ -3672,7 +3672,7 @@ std::string llama_token_to_piece(const struct llama_model* model, llama_token to
return piece;
}

- std::string llama_detokenize(const llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+ std::string common_token_to_piece(const llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
std::string text;
text.resize(std::max(text.capacity(), tokens.size()));
int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);

@@ -511,9 +511,9 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor

// Batch utils

- void llama_batch_clear(struct llama_batch & batch);
+ void common_batch_clear(struct llama_batch & batch);

- void llama_batch_add(
+ void common_batch_add(
struct llama_batch & batch,
llama_token id,
llama_pos pos,

@@ -532,7 +532,7 @@ std::vector<llama_token> llama_tokenize(
bool add_special,
bool parse_special = false);

- std::vector<llama_token> llama_tokenize(
+ std::vector<llama_token> common_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_special,

@@ -546,7 +546,7 @@ std::vector<llama_token> llama_tokenize(

// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
- std::string llama_token_to_piece(
+ std::string common_token_to_piece(
const struct llama_context * ctx,
llama_token token,
bool special = true);

@@ -559,7 +559,7 @@ std::string llama_token_to_piece(
// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens
- std::string llama_detokenize(
+ std::string common_token_to_piece(
const llama_context * ctx,
const std::vector<llama_token> & tokens,
bool special = true);

@@ -756,7 +756,7 @@ inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
first = false;
}

- auto detokenized = llama_token_to_piece(ctx, token);
+ auto detokenized = common_token_to_piece(ctx, token);

detokenized.erase(
std::remove_if(

@@ -6,7 +6,7 @@
#include <nlohmann/json.hpp>
using json = nlohmann::ordered_json;

- struct llama_sampling_context * llama_sampling_init(const struct llama_vocab* vocab, const struct llama_sampling_params & params) {
+ struct llama_sampling_context * common_sampler_init(const struct llama_vocab* vocab, const struct llama_sampling_params & params) {
struct llama_sampling_context * result = new llama_sampling_context();

result->params = params;

@@ -129,7 +129,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_vocab* vo
return result;
}

- void llama_sampling_free(struct llama_sampling_context * ctx) {
+ void common_sampler_free(struct llama_sampling_context * ctx) {
if (ctx->grammar != NULL) {
llama_grammar_free(ctx->grammar);
}

@@ -138,7 +138,7 @@ void llama_sampling_free(struct llama_sampling_context * ctx) {
delete ctx;
}

- void llama_sampling_reset(const struct llama_vocab* vocab, llama_sampling_context * ctx) {
+ void common_sampler_reset(const struct llama_vocab* vocab, llama_sampling_context * ctx) {

if (ctx->grammar != NULL) {
llama_grammar_free(ctx->grammar);

@@ -239,7 +239,7 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama
std::string result;

for (int i = size - n; i < size; i++) {
- result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
+ result += common_token_to_piece(ctx_main, ctx_sampling->prev[i]);
}

return result;

@@ -490,11 +490,11 @@ static llama_token llama_sampling_sample_impl(
// for (int i = 0; i < n_top; i++) {
// const llama_token id = cur_p.data[i].id;
// (void)id; // To avoid a warning that id is unused when logging is disabled.
- // LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
+ // LOG(" - %5d: '%12s' (%.3f)\n", id, common_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
// }
//}

- //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
+ //LOG("sampled token: %5d: '%s'\n", id, common_token_to_piece(ctx_main, id).c_str());
}
}

@@ -514,7 +514,7 @@ static llama_token llama_sampling_sample_impl(

// If the token is not valid according to the grammar, perform resampling
if (!is_valid) {
- LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
+ LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, common_token_to_piece(ctx_main, id).c_str());

// Restore logits from the copy
std::copy(original_logits.begin(), original_logits.end(), logits);

@@ -606,7 +606,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
return cur_p;
}

- llama_token llama_sampling_sample(
+ llama_token common_sampler_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,

@@ -625,7 +625,7 @@ llama_token_data_array llama_sampling_prepare(
return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
}

- void llama_sampling_accept(
+ void common_sampler_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
llama_token id,

@@ -644,7 +644,7 @@ void llama_sampling_accept(
}
}

- llama_token_data_array * llama_sampling_get_candidates(struct llama_sampling_context * ctx_sampling) {
+ llama_token_data_array * common_sampler_get_candidates(struct llama_sampling_context * ctx_sampling) {
return &ctx_sampling->cur_p;
}

@@ -654,10 +654,10 @@ std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_samplin
idxs[i] = i;
}

- return llama_sampling_sample_and_accept_n(gsmpl, ctx, idxs, draft);
+ return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
}

- std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const std::vector<llama_token> & draft) {
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const std::vector<llama_token> & draft) {
GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");

std::vector<llama_token> result;

@@ -665,9 +665,9 @@ std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_samplin

size_t i = 0;
for (; i < draft.size(); i++) {
- const llama_token id = llama_sampling_sample(gsmpl, ctx, nullptr, idxs[i]);
+ const llama_token id = common_sampler_sample(gsmpl, ctx, nullptr, idxs[i]);

- llama_sampling_accept(gsmpl, ctx, id, true);
+ common_sampler_accept(gsmpl, ctx, id, true);

result.push_back(id);

@@ -677,9 +677,9 @@ std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_samplin
}

if (i == draft.size()) {
- const llama_token id = llama_sampling_sample(gsmpl, ctx, nullptr, idxs[i]);
+ const llama_token id = common_sampler_sample(gsmpl, ctx, nullptr, idxs[i]);

- llama_sampling_accept(gsmpl, ctx, id, true);
+ common_sampler_accept(gsmpl, ctx, id, true);

result.push_back(id);
}

@@ -134,14 +134,14 @@ struct llama_sampling_context {


// Create a new sampling context instance.
- struct llama_sampling_context * llama_sampling_init(const struct llama_vocab* vocab, const struct llama_sampling_params & params);
+ struct llama_sampling_context * common_sampler_init(const struct llama_vocab* vocab, const struct llama_sampling_params & params);

- void llama_sampling_free(struct llama_sampling_context * ctx);
+ void common_sampler_free(struct llama_sampling_context * ctx);

// Reset the sampler context
// - clear prev tokens
// - reset grammar
- void llama_sampling_reset(const struct llama_vocab* vocab, llama_sampling_context * ctx);
+ void common_sampler_reset(const struct llama_vocab* vocab, llama_sampling_context * ctx);

// Set the sampler seed
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);

@@ -169,7 +169,7 @@ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::strin
// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call
- // llama_sampling_reset when a sequence ends
+ // common_sampler_reset when a sequence ends
//
// required:
// - ctx_main: context to use for sampling

@@ -183,7 +183,7 @@ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::strin
// - token: sampled token
// - candidates: vector of candidate tokens
//
- llama_token llama_sampling_sample(
+ llama_token common_sampler_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,

@@ -198,7 +198,7 @@ llama_token_data_array llama_sampling_prepare(
bool apply_grammar = true,
std::vector<float> * original_logits = nullptr);

- void llama_sampling_accept(
+ void common_sampler_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
llama_token id,

@@ -206,11 +206,11 @@ void llama_sampling_accept(

// returns at least 1 token, up to draft.size()
// access the internal list of current candidate tokens
- llama_token_data_array * llama_sampling_get_candidates(struct llama_sampling_context * ctx_sampling);
+ llama_token_data_array * common_sampler_get_candidates(struct llama_sampling_context * ctx_sampling);

std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector<llama_token> & draft);

- std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const std::vector<llama_token> & draft);
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const std::vector<llama_token> & draft);

llama_grammar* llama_sampler_init_llg(const llama_vocab* vocab,
const char* grammar_kind, const char* grammar_data);

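Taken together, the declarations above define the renamed sampler lifecycle: common_sampler_init, then repeated common_sampler_sample / common_sampler_accept, and finally common_sampler_free. A minimal sketch under those declarations (model/context setup and the decode call are omitted, and the helper function itself is hypothetical):

static void sample_n_tokens(llama_model * model, llama_context * ctx, const llama_sampling_params & sparams, int n) {
    struct llama_sampling_context * smpl = common_sampler_init(llama_get_model_vocab(model), sparams);

    for (int i = 0; i < n; ++i) {
        const llama_token id = common_sampler_sample(smpl, ctx, /*ctx_cfg=*/nullptr, /*idx=*/0);
        common_sampler_accept(smpl, ctx, id, /*apply_grammar=*/true);
        // a real loop would feed the sampled token back through llama_decode() before sampling again
    }

    common_sampler_free(smpl);
}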
@@ -59,7 +59,7 @@ struct llama_speculative * llama_speculative_init(
llama_sampler_type::TOP_K,
};
const auto *model_dft = llama_get_model(ctx_dft);
- result->smpl = llama_sampling_init(llama_get_model_vocab(model_dft), params);
+ result->smpl = common_sampler_init(llama_get_model_vocab(model_dft), params);
}
#endif

@@ -74,7 +74,7 @@ void llama_speculative_free(struct llama_speculative * spec) {
return;
}

- llama_sampling_free(spec->smpl);
+ common_sampler_free(spec->smpl);

llama_batch_free(spec->batch);

@@ -133,8 +133,8 @@ bool llama_speculative_are_compatible(
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
LLAMA_LOG_INFO("%s: draft model vocab must match target model to use speculation but ", __func__);
LLAMA_LOG_INFO("token %d content differs - target '%s', draft '%s'\n", i,
- llama_token_to_piece(ctx_tgt, i).c_str(),
- llama_token_to_piece(ctx_dft, i).c_str());
+ common_token_to_piece(ctx_tgt, i).c_str(),
+ common_token_to_piece(ctx_dft, i).c_str());
return false;
}
}

@@ -201,14 +201,14 @@ std::vector<llama_token> llama_speculative_gen_draft(
std::vector<llama_token> prompt_tgt_draft_model;
if (!spec->vocab_dft_compatible) {
std::string text;
- text = llama_detokenize(ctx_tgt, prompt_tgt_main_model, true);
+ text = common_token_to_piece(ctx_tgt, prompt_tgt_main_model, true);
text = replace_to_dft(spec, text);
LLAMA_LOG_DEBUG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str());
prompt_tgt_draft_model = llama_tokenize(ctx_dft, text, false, true);

// convert id_last to draft vocab
std::vector<llama_token> id_last_vec(1, id_last);
- text = llama_detokenize(ctx_tgt, id_last_vec);
+ text = common_token_to_piece(ctx_tgt, id_last_vec);
LLAMA_LOG_DEBUG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
id_last = llama_tokenize(ctx_dft, text, false, true)[0];
}

@@ -239,7 +239,7 @@ std::vector<llama_token> llama_speculative_gen_draft(
result.reserve(params.n_draft);

if (reuse_n == 0) {
- llama_kv_cache_clear(ctx_dft);
+ llama_memory_clear(ctx_dft);

prompt_dft.clear();
} else {

@@ -258,25 +258,25 @@ std::vector<llama_token> llama_speculative_gen_draft(
}

if (reuse_i > 0) {
- llama_kv_cache_seq_rm (ctx_dft, 0, 0, reuse_i);
- llama_kv_cache_seq_add(ctx_dft, 0, reuse_i, -1, -reuse_i);
+ llama_memory_seq_rm (ctx_dft, 0, 0, reuse_i);
+ llama_memory_seq_add(ctx_dft, 0, reuse_i, -1, -reuse_i);

prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
}

if (reuse_n < (int) prompt_dft.size()) {
- llama_kv_cache_seq_rm (ctx_dft, 0, reuse_n, -1);
+ llama_memory_seq_rm (ctx_dft, 0, reuse_n, -1);

prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
}
}

// prepare a batch to evaluate any new tokens in the prompt
- llama_batch_clear(batch);
+ common_batch_clear(batch);

for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
//LLAMA_LOG_INFO("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
- llama_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
+ common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);

prompt_dft.push_back(prompt_tgt[i]);
}

@@ -292,8 +292,8 @@ std::vector<llama_token> llama_speculative_gen_draft(

// LLAMA_LOG_INFO("%s: n_past = %d\n", __func__, n_past);

- llama_batch_clear(batch);
- llama_batch_add (batch, id_last, n_past, { 0 }, true);
+ common_batch_clear(batch);
+ common_batch_add (batch, id_last, n_past, { 0 }, true);

prompt_dft.push_back(id_last);

@@ -301,25 +301,25 @@ std::vector<llama_token> llama_speculative_gen_draft(

llama_decode(ctx_dft, batch);

- llama_sampling_reset(llama_get_vocab(ctx_dft), smpl);
+ common_sampler_reset(llama_get_vocab(ctx_dft), smpl);

// sample n_draft tokens from the draft model
for (int i = 0; i < params.n_draft; ++i) {
- llama_batch_clear(batch);
+ common_batch_clear(batch);

- llama_sampling_sample(smpl, ctx_dft, nullptr, 0);
+ common_sampler_sample(smpl, ctx_dft, nullptr, 0);

- const auto * cur_p = llama_sampling_get_candidates(smpl);
+ const auto * cur_p = common_sampler_get_candidates(smpl);

// for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
// LLAMA_LOG_INFO(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
- // k, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
+ // k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
// }

// add drafted token for each sequence
const llama_token id = cur_p->data[0].id;

- llama_sampling_accept(smpl, ctx_dft, id, true);
+ common_sampler_accept(smpl, ctx_dft, id, true);

result.push_back(id);

@@ -332,7 +332,7 @@ std::vector<llama_token> llama_speculative_gen_draft(
break;
}

- llama_batch_add(batch, id, n_past + i + 1, { 0 }, true);
+ common_batch_add(batch, id, n_past + i + 1, { 0 }, true);

// evaluate the drafted tokens on the draft model
llama_decode(ctx_dft, batch);

@@ -341,7 +341,7 @@ std::vector<llama_token> llama_speculative_gen_draft(
}

if (!spec->vocab_dft_compatible) {
- std::string detokenized = llama_detokenize(ctx_dft, result, true);
+ std::string detokenized = common_token_to_piece(ctx_dft, result, true);
detokenized = replace_to_tgt(spec, detokenized);
LLAMA_LOG_DEBUG("draft->main detokenized string: '%s'\n", detokenized.c_str());
result = llama_tokenize(ctx_tgt, detokenized, false, true);

@@ -955,7 +955,7 @@ size_t tokenize_file(
}

if (sample_size > 0) {
- // llama_tokenize expects zero terminated string,
+ // common_tokenize expects zero terminated string,
// copy sample into buffer and zero terminate it.
buf_sample.resize(sample_size);
memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);

@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
|
||||
// warm up
|
||||
{
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
llama_batch_add(batch, 0, i, { 0 }, false);
|
||||
common_batch_add(batch, 0, i, { 0 }, false);
|
||||
}
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
@@ -144,18 +144,18 @@ int main(int argc, char ** argv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
llama_batch_clear(batch);
|
||||
common_batch_clear(batch);
|
||||
|
||||
for (int i = 0; i < pp; ++i) {
|
||||
for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
|
||||
llama_batch_add(batch, 0, i, { j }, false);
|
||||
common_batch_add(batch, 0, i, { j }, false);
|
||||
}
|
||||
}
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
|
||||
const auto t_pp_start = ggml_time_us();
|
||||
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_memory_clear(ctx);
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
@@ -164,7 +164,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (is_pp_shared) {
|
||||
for (int32_t i = 1; i < pl; ++i) {
|
||||
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
||||
llama_memory_seq_cp(ctx, 0, i, -1, -1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -173,10 +173,10 @@ int main(int argc, char ** argv) {
|
||||
const auto t_tg_start = ggml_time_us();
|
||||
|
||||
for (int i = 0; i < tg; ++i) {
|
||||
llama_batch_clear(batch);
|
||||
common_batch_clear(batch);
|
||||
|
||||
for (int j = 0; j < pl; ++j) {
|
||||
llama_batch_add(batch, 0, pp + i, { j }, true);
|
||||
common_batch_add(batch, 0, pp + i, { j }, true);
|
||||
}
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
|
||||
@@ -52,7 +52,7 @@ int main(int argc, char ** argv) {
|
||||
// tokenize the prompt
|
||||
|
||||
std::vector<llama_token> tokens_list;
|
||||
tokens_list = ::llama_tokenize(model, params.prompt, true);
|
||||
tokens_list = ::common_tokenize(model, params.prompt, true);
|
||||
|
||||
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
|
||||
|
||||
@@ -86,7 +86,7 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
for (auto id : tokens_list) {
|
||||
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
||||
fprintf(stderr, "%s", common_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
|
||||
fflush(stderr);
|
||||
@@ -102,7 +102,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// evaluate the initial prompt
|
||||
for (size_t i = 0; i < tokens_list.size(); ++i) {
|
||||
llama_batch_add(batch, tokens_list[i], i, seq_ids, false);
|
||||
common_batch_add(batch, tokens_list[i], i, seq_ids, false);
|
||||
}
|
||||
GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
|
||||
|
||||
@@ -117,8 +117,8 @@ int main(int argc, char ** argv) {
|
||||
decoder_start_token_id = llama_token_bos(model);
|
||||
}
|
||||
|
||||
llama_batch_clear(batch);
|
||||
llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
|
||||
common_batch_clear(batch);
|
||||
common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
|
||||
}
|
||||
|
||||
// llama_decode will output logits only for the last token of the prompt
|
||||
@@ -132,7 +132,7 @@ int main(int argc, char ** argv) {
|
||||
//// assign the system KV cache to all parallel sequences
|
||||
//// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
|
||||
//for (int32_t i = 1; i < n_parallel; ++i) {
|
||||
// llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
||||
// llama_memory_seq_cp(ctx, 0, i, -1, -1);
|
||||
//}
|
||||
|
||||
if (n_parallel > 1) {
|
||||
@@ -155,7 +155,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
while (n_cur <= n_predict) {
|
||||
// prepare the next batch
|
||||
llama_batch_clear(batch);
|
||||
common_batch_clear(batch);
|
||||
|
||||
// sample the next token for each parallel sequence / stream
|
||||
for (int32_t i = 0; i < n_parallel; ++i) {
|
||||
@@ -201,16 +201,16 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// if there is only one stream, we print immediately to stdout
|
||||
if (n_parallel == 1) {
|
||||
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
||||
LOG_TEE("%s", common_token_to_piece(ctx, new_token_id).c_str());
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
streams[i] += llama_token_to_piece(ctx, new_token_id);
|
||||
streams[i] += common_token_to_piece(ctx, new_token_id);
|
||||
|
||||
i_batch[i] = batch.n_tokens;
|
||||
|
||||
// push this new token for next evaluation
|
||||
llama_batch_add(batch, new_token_id, n_cur, { i }, true);
|
||||
common_batch_add(batch, new_token_id, n_cur, { i }, true);
|
||||
|
||||
n_decode += 1;
|
||||
}
|
||||
|
||||
@@ -29,7 +29,7 @@ template <class Iter>
|
||||
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
|
||||
std::string ret;
|
||||
for (; begin != end; ++begin) {
|
||||
ret += llama_token_to_piece(ctx, *begin);
|
||||
ret += common_token_to_piece(ctx, *begin);
|
||||
}
|
||||
|
||||
return ret;
|
||||
@@ -338,7 +338,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
}
|
||||
|
||||
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_memory_clear(ctx);
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
|
||||
@@ -26,7 +26,7 @@ static std::vector<std::string> split_lines(const std::string & s, const std::st
|
||||
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
|
||||
size_t n_tokens = tokens.size();
|
||||
for (size_t i = 0; i < n_tokens; i++) {
|
||||
llama_batch_add(batch, tokens[i], i, { seq_id }, true);
|
||||
common_batch_add(batch, tokens[i], i, { seq_id }, true);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
||||
const struct llama_model * model = llama_get_model(ctx);
|
||||
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_memory_clear(ctx);
|
||||
|
||||
// run model
|
||||
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||
@@ -166,7 +166,7 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
|
||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
|
||||
for (int j = 0; j < (int) inputs[i].size(); j++) {
|
||||
fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
|
||||
fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], common_token_to_piece(ctx, inputs[i][j]).c_str());
|
||||
}
|
||||
fprintf(stderr, "\n\n");
|
||||
}
|
||||
@@ -206,7 +206,7 @@ int main(int argc, char ** argv) {
|
||||
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
|
||||
e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
|
||||
s = 0;
|
||||
llama_batch_clear(batch);
|
||||
common_batch_clear(batch);
|
||||
}
|
||||
|
||||
// add to batch
|
||||
|
||||
@@ -14,11 +14,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
||||
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
||||
|
||||
for (uint64_t i = 0; i < sentences.size(); i++) {
|
||||
llama_batch_clear(batch);
|
||||
common_batch_clear(batch);
|
||||
|
||||
const std::string input_string = instruction + sentences[i];
|
||||
|
||||
std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
|
||||
std::vector<llama_token> inputs = common_tokenize(mdl, input_string, true, false);
|
||||
|
||||
const int32_t n_toks = inputs.size();
|
||||
|
||||
@@ -27,7 +27,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
||||
// inputs.push_back(llama_token_eos(mdl));
|
||||
|
||||
// we want to ignore instruction tokens for mean pooling
|
||||
const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size();
|
||||
const int32_t n_inst = common_tokenize(mdl, instruction, true, false).size();
|
||||
|
||||
#ifdef GRIT_DEBUG
|
||||
// debug tokens - should be matching as referenced in the GritLM sample
|
||||
@@ -39,11 +39,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
||||
|
||||
// add input to batch (this increments n_tokens)
|
||||
for (int32_t j = 0; j < n_toks; j++) {
|
||||
llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
|
||||
common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
|
||||
}
|
||||
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_memory_clear(ctx);
|
||||
llama_set_embeddings(ctx, true);
|
||||
llama_set_causal_attn(ctx, false);
|
||||
|
||||
@@ -98,20 +98,20 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
|
||||
const llama_model * mdl = llama_get_model(ctx);
|
||||
llama_token eos_token = llama_token_eos(mdl);
|
||||
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_memory_clear(ctx);
|
||||
llama_set_embeddings(ctx, false);
|
||||
llama_set_causal_attn(ctx, true);
|
||||
|
||||
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
||||
|
||||
std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
|
||||
std::vector<llama_token> inputs = common_tokenize(mdl, prompt, false, true);
|
||||
int32_t i_current_token = 0;
|
||||
|
||||
while (true) {
|
||||
llama_batch_clear(bat);
|
||||
common_batch_clear(bat);
|
||||
auto n_inputs = (int32_t)inputs.size();
|
||||
for (int32_t i = 0; i < n_inputs; i++) {
|
||||
llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
|
||||
common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
|
||||
}
|
||||
inputs.clear();
|
||||
|
||||
@@ -130,7 +130,7 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
|
||||
break;
|
||||
}
|
||||
|
||||
std::string piece = llama_token_to_piece(ctx, token);
|
||||
std::string piece = common_token_to_piece(ctx, token);
|
||||
if (stream) {
|
||||
std::printf("%s", piece.c_str());
|
||||
std::fflush(stdout);
|
||||
|
||||
@@ -703,7 +703,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear the KV cache
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_memory_clear(ctx);
|
||||
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
const int batch_start = start + j * n_batch;
|
||||
|
||||
@@ -264,13 +264,13 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
|
||||
if (params.n_keep > 0) {
|
||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
LOG_TEE("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
LOG_TEE("'\n");
|
||||
}
|
||||
@@ -349,7 +349,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(llama_get_model_vocab(model), sparams);
|
||||
struct llama_sampling_context * ctx_sampling = common_sampler_init(llama_get_model_vocab(model), sparams);
|
||||
|
||||
while (n_remain != 0 || params.interactive) {
|
||||
// predict
|
||||
@@ -385,8 +385,8 @@ int main(int argc, char ** argv) {
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
llama_memory_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_memory_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
@@ -421,9 +421,9 @@ int main(int argc, char ** argv) {
|
||||
embd.clear();
|
||||
|
||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
|
||||
const llama_token id = common_sampler_sample(ctx_sampling, ctx, nullptr);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
common_sampler_accept(ctx_sampling, ctx, id, true);
|
||||
|
||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
||||
|
||||
@@ -444,7 +444,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||
// for the prompt, we don't apply grammar rules
|
||||
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
|
||||
common_sampler_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
|
||||
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
@@ -456,7 +456,7 @@ int main(int argc, char ** argv) {
|
||||
// display text
|
||||
if (input_echo) {
|
||||
for (auto id : embd) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
const std::string token_str = common_token_to_piece(ctx, id);
|
||||
printf("%s", token_str.c_str());
|
||||
|
||||
if (embd.size() > 1) {
|
||||
@@ -479,7 +479,7 @@ int main(int argc, char ** argv) {
|
||||
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
|
||||
if (is_interacting && !params.interactive_first) {
|
||||
// print an eot token
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||
printf("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n");
|
||||
@@ -601,7 +601,7 @@ int main(int argc, char ** argv) {
|
||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
||||
const llama_token token = embd_inp[i];
|
||||
output_tokens.push_back(token);
|
||||
output_ss << llama_token_to_piece(ctx, token);
|
||||
output_ss << common_token_to_piece(ctx, token);
|
||||
}
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
@@ -615,7 +615,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (n_past > 0) {
|
||||
if (is_interacting) {
|
||||
llama_sampling_reset(llama_get_model_vocab(model), ctx_sampling);
|
||||
common_sampler_reset(llama_get_model_vocab(model), ctx_sampling);
|
||||
}
|
||||
is_interacting = false;
|
||||
}
|
||||
@@ -634,7 +634,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
if (!params.interactive && n_remain <= 0) {
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||
printf("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
@@ -644,7 +644,7 @@ int main(int argc, char ** argv) {
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_sampling_free(ctx_sampling);
|
||||
common_sampler_free(ctx_sampling);
|
||||
llama_backend_free();
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
|
||||
@@ -2136,7 +2136,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
test t(inst, lmodel, ctx);
|
||||
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_memory_clear(ctx);
|
||||
|
||||
// warmup run
|
||||
if (params.warmup) {
|
||||
@@ -2150,7 +2150,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
for (int i = 0; i < params.reps; i++) {
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_memory_clear(ctx);
|
||||
|
||||
uint64_t t_start = get_time_ns();
|
||||
|
||||
|
||||
@@ -82,7 +82,7 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
for (auto id : inp) {
|
||||
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
||||
fprintf(stderr, "%s", common_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
|
||||
fflush(stderr);
|
||||
@@ -96,7 +96,7 @@ int main(int argc, char ** argv) {
|
||||
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
|
||||
|
||||
for (int s = 1; s < W + G + 1; ++s) {
|
||||
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
|
||||
llama_memory_seq_cp(ctx, 0, s, -1, -1);
|
||||
}
|
||||
|
||||
const auto t_enc_end = ggml_time_us();
|
||||
@@ -118,7 +118,7 @@ int main(int argc, char ** argv) {
|
||||
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
|
||||
|
||||
// target model sampling context
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(llama_get_model_vocab(model), params.sparams);
|
||||
struct llama_sampling_context * ctx_sampling = common_sampler_init(llama_get_model_vocab(model), params.sparams);
|
||||
|
||||
// verification n-grams
|
||||
std::vector<ngram_data> ngrams_cur(G);
|
||||
@@ -159,12 +159,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// sample first token
|
||||
{
|
||||
id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
|
||||
id = common_sampler_sample(ctx_sampling, ctx, NULL, 0);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
common_sampler_accept(ctx_sampling, ctx, id, true);
|
||||
|
||||
{
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
const std::string token_str = common_token_to_piece(ctx, id);
|
||||
|
||||
printf("%s", token_str.c_str());
|
||||
fflush(stdout);
|
||||
@@ -204,10 +204,10 @@ int main(int argc, char ** argv) {
|
||||
// V V V V V V
|
||||
// id
|
||||
{
|
||||
llama_batch_clear(batch);
|
||||
common_batch_clear(batch);
|
||||
|
||||
// current token - first token of the first level
|
||||
llama_batch_add(batch, id, n_past, seq_id_all, true);
|
||||
common_batch_add(batch, id, n_past, seq_id_all, true);
|
||||
|
||||
// verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
|
||||
{
|
||||
@@ -232,7 +232,7 @@ int main(int argc, char ** argv) {
|
||||
ngrams_cur[g].tokens [j + 1] = t;
|
||||
ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;
|
||||
|
||||
llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
|
||||
common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -244,13 +244,13 @@ int main(int argc, char ** argv) {
|
||||
seq_id_look[j] = i + j + 1;
|
||||
}
|
||||
|
||||
llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
|
||||
common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
|
||||
}
|
||||
|
||||
// fill the rest of the levels
|
||||
for (int j = 1; j < N - 1; j++) {
|
||||
for (int i = 0; i < W; i++) {
|
||||
llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
|
||||
common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -284,13 +284,13 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// sample the next token
|
||||
id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);
|
||||
id = common_sampler_sample(ctx_sampling, ctx, NULL, i_batch);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
common_sampler_accept(ctx_sampling, ctx, id, true);
|
||||
|
||||
// print
|
||||
{
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
const std::string token_str = common_token_to_piece(ctx, id);
|
||||
|
||||
if (v == 0) {
|
||||
printf("%s", token_str.c_str());
|
||||
@@ -330,7 +330,7 @@ int main(int argc, char ** argv) {
|
||||
// print known n-grams starting with token id (debug)
|
||||
if (0 && v == 0) {
|
||||
if (ngrams_observed.cnt[id] > 0) {
|
||||
printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
|
||||
printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], common_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
|
||||
for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
|
||||
@@ -339,7 +339,7 @@ int main(int argc, char ** argv) {
|
||||
const int idx = id*(N - 1)*G + i*(N - 1);
|
||||
|
||||
for (int j = 0; j < N - 1; j++) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
|
||||
const std::string token_str = common_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
|
||||
|
||||
printf("%s", token_str.c_str());
|
||||
}
|
||||
@@ -361,7 +361,7 @@ int main(int argc, char ** argv) {
|
||||
if (v == 0) {
|
||||
// sample from the last level
|
||||
for (int i = 0; i < W; i++) {
|
||||
tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
|
||||
tokens_j[N - 2][i] = common_sampler_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < W; i++) {
|
||||
@@ -438,17 +438,17 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// KV cache management
|
||||
// if no verification token matched, we simply remove all cells from this batch -> no fragmentation
|
||||
llama_kv_cache_seq_rm(ctx, -1, n_past, -1);
|
||||
llama_memory_seq_rm(ctx, -1, n_past, -1);
|
||||
|
||||
if (seq_id_best != 0) {
|
||||
// if a verification token matched, we keep the best sequence and remove the rest
|
||||
// this leads to some KV cache fragmentation
|
||||
llama_kv_cache_seq_keep(ctx, seq_id_best);
|
||||
llama_kv_cache_seq_cp (ctx, seq_id_best, 0, -1, -1);
|
||||
llama_kv_cache_seq_rm (ctx, seq_id_best, -1, -1);
|
||||
llama_memory_seq_cp (ctx, seq_id_best, 0, -1, -1);
|
||||
llama_memory_seq_rm (ctx, seq_id_best, -1, -1);
|
||||
|
||||
for (int s = 1; s < W + G + 1; ++s) {
|
||||
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
|
||||
llama_memory_seq_cp(ctx, 0, s, -1, -1);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -471,7 +471,7 @@ int main(int argc, char ** argv) {
|
||||
llama_print_timings(ctx);
|
||||
|
||||
llama_kv_cache_view_free(&kvc_view);
|
||||
llama_sampling_free(ctx_sampling);
|
||||
common_sampler_free(ctx_sampling);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
|
||||
@@ -84,7 +84,7 @@ int main(int argc, char ** argv){
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
for (auto id : inp) {
|
||||
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
||||
fprintf(stderr, "%s", common_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
|
||||
fflush(stderr);
|
||||
@@ -106,7 +106,7 @@ int main(int argc, char ** argv){
|
||||
|
||||
bool has_eos = false;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(llama_get_model_vocab(model), params.sparams);
|
||||
struct llama_sampling_context * ctx_sampling = common_sampler_init(llama_get_model_vocab(model), params.sparams);
|
||||
|
||||
std::vector<llama_token> draft;
|
||||
|
||||
@@ -130,11 +130,11 @@ int main(int argc, char ** argv){
|
||||
int i_dft = 0;
|
||||
while (true) {
|
||||
// sample from the target model
|
||||
llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);
|
||||
llama_token id = common_sampler_sample(ctx_sampling, ctx, NULL, i_dft);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
common_sampler_accept(ctx_sampling, ctx, id, true);
|
||||
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
const std::string token_str = common_token_to_piece(ctx, id);
|
||||
|
||||
if (!params.use_color) {
|
||||
printf("%s", token_str.c_str());
|
||||
@@ -194,10 +194,10 @@ int main(int argc, char ** argv){
|
||||
|
||||
// KV cache management
|
||||
// clean the cache of draft tokens that weren't accepted
|
||||
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||
llama_memory_seq_rm(ctx, 0, n_past, -1);
|
||||
|
||||
llama_batch_clear(batch_tgt);
|
||||
llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
|
||||
common_batch_clear(batch_tgt);
|
||||
common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
|
||||
|
||||
// Draft already contains a single token sampled from the model:
|
||||
GGML_ASSERT(draft.size() == 1);
|
||||
@@ -207,7 +207,7 @@ int main(int argc, char ** argv){
|
||||
llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
|
||||
|
||||
for (size_t i = 1; i < draft.size(); ++i) {
|
||||
llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
|
||||
common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
|
||||
}
|
||||
|
||||
t_draft_us += ggml_time_us() - t_start_draft_us;
|
||||
@@ -243,7 +243,7 @@ int main(int argc, char ** argv){
|
||||
LOG_TEE("\ntarget:\n");
|
||||
llama_print_timings(ctx);
|
||||
|
||||
llama_sampling_free(ctx_sampling);
|
||||
common_sampler_free(ctx_sampling);
|
||||
llama_batch_free(batch_tgt);
|
||||
|
||||
llama_free(ctx);
|
||||
|
||||
@@ -366,7 +366,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// remove any "future" tokens that we might have inherited from the previous session
|
||||
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
|
||||
llama_memory_seq_rm(ctx, -1, n_matching_session_tokens, -1);
|
||||
}
|
||||
|
||||
LOGLN(
|
||||
@@ -402,7 +402,7 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
|
||||
if (ctx_guidance) {
|
||||
@@ -410,14 +410,14 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
|
||||
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
||||
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], common_token_to_piece(ctx, guidance_inp[i]).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if (params.n_keep > add_bos) {
|
||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
LOG_TEE("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
LOG_TEE("'\n");
|
||||
}
|
||||
@@ -449,7 +449,7 @@ int main(int argc, char ** argv) {
|
||||
if (params.verbose_prompt) {
|
||||
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||
LOG_TEE("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -464,7 +464,7 @@ int main(int argc, char ** argv) {
|
||||
if (params.verbose_prompt) {
|
||||
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||
LOG_TEE("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -474,7 +474,7 @@ int main(int argc, char ** argv) {
|
||||
if (params.verbose_prompt) {
|
||||
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||
LOG_TEE("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -549,7 +549,7 @@ int main(int argc, char ** argv) {
|
||||
antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
|
||||
}
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(llama_get_model_vocab(model), sparams);
|
||||
struct llama_sampling_context * ctx_sampling = common_sampler_init(llama_get_model_vocab(model), sparams);
|
||||
if (!ctx_sampling) {
|
||||
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
|
||||
exit(1);
|
||||
@@ -608,8 +608,8 @@ int main(int argc, char ** argv) {
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
|
||||
llama_memory_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||
llama_memory_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
@@ -636,9 +636,9 @@ int main(int argc, char ** argv) {
|
||||
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
|
||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
|
||||
|
||||
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
||||
llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
|
||||
llama_memory_seq_add(ctx, 0, ga_i, n_past, ib*bd);
|
||||
llama_memory_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
||||
llama_memory_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
|
||||
|
||||
n_past -= bd;
|
||||
|
||||
@@ -750,9 +750,9 @@ int main(int argc, char ** argv) {
|
||||
LOG("saved session to %s\n", path_session.c_str());
|
||||
}
|
||||
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
||||
const llama_token id = common_sampler_sample(ctx_sampling, ctx, ctx_guidance);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
|
||||
common_sampler_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
|
||||
|
||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
||||
|
||||
@@ -773,7 +773,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||
// for the prompt, we don't apply grammar rules
|
||||
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
|
||||
common_sampler_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
|
||||
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
@@ -785,7 +785,7 @@ int main(int argc, char ** argv) {
|
||||
// display text
|
||||
if (input_echo && display) {
|
||||
for (auto id : embd) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
|
||||
const std::string token_str = common_token_to_piece(ctx, id, params.special);
|
||||
|
||||
// Console/Stream Output
|
||||
fprintf(stdout, "%s", token_str.c_str());
|
||||
@@ -877,7 +877,7 @@ int main(int argc, char ** argv) {
|
||||
// if current token is not EOG, we add it to current assistant message
|
||||
if (params.conversation && !waiting_for_first_input) {
|
||||
auto id = llama_sampling_last(ctx_sampling);
|
||||
assistant_ss << llama_token_to_piece(ctx, id, false);
|
||||
assistant_ss << common_token_to_piece(ctx, id, false);
|
||||
}
|
||||
|
||||
if ((n_past > 0 || waiting_for_first_input) && is_interacting) {
|
||||
@@ -955,7 +955,7 @@ int main(int argc, char ** argv) {
|
||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
||||
const llama_token token = embd_inp[i];
|
||||
output_tokens.push_back(token);
|
||||
output_ss << llama_token_to_piece(ctx, token);
|
||||
output_ss << common_token_to_piece(ctx, token);
|
||||
}
|
||||
|
||||
// reset assistant message
|
||||
@@ -973,7 +973,7 @@ int main(int argc, char ** argv) {
|
||||
if (n_past > 0 || waiting_for_first_input) {
|
||||
if (is_interacting) {
|
||||
|
||||
llama_sampling_reset(llama_get_model_vocab(model), ctx_sampling);
|
||||
common_sampler_reset(llama_get_model_vocab(model), ctx_sampling);
|
||||
}
|
||||
is_interacting = false;
|
||||
waiting_for_first_input = false;
|
||||
@@ -1006,7 +1006,7 @@ int main(int argc, char ** argv) {
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_sampling_free(ctx_sampling);
|
||||
common_sampler_free(ctx_sampling);
|
||||
llama_backend_free();
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
|
||||
@@ -72,30 +72,12 @@ using common_params = gpt_params;
|
||||
inline common_init_result common_init_from_params(gpt_params & params) {
|
||||
return llama_init_from_gpt_params(params);
|
||||
}
|
||||
inline llama_sampling_context * common_sampler_init(const llama_model * model, const llama_sampling_params & sparams) {
|
||||
return llama_sampling_init(llama_get_model_vocab(model), sparams);
|
||||
}
|
||||
|
||||
inline std::vector<llama_token> common_tokenize(const llama_context * ctx, const std::string & text, bool add_special, bool parse_special = false) {
|
||||
return llama_tokenize(ctx, text, add_special, parse_special);
|
||||
}
|
||||
inline void common_sampler_free(common_sampler * smpl) {
|
||||
llama_sampling_free(smpl);
|
||||
}
|
||||
inline llama_token common_sampler_sample(common_sampler * gsmpl, llama_context * ctx, int idx, [[maybe_unused]] bool grammar_first = false) {
|
||||
return llama_sampling_sample(gsmpl, ctx, nullptr, idx);
|
||||
}
|
||||
inline void common_sampler_accept(common_sampler * gsmpl, llama_context * ctx, llama_token token, bool accept_grammar) {
|
||||
llama_sampling_accept(gsmpl, ctx, token, accept_grammar);
|
||||
}
|
||||
inline std::string common_token_to_piece(const llama_context * ctx, llama_token token, bool special = true) {
|
||||
return llama_token_to_piece(ctx, token, special);
|
||||
}
|
||||
inline void common_batch_clear(llama_batch & batch) {
|
||||
llama_batch_clear(batch);
|
||||
}
|
||||
inline void common_batch_add(llama_batch & batch, llama_token id, llama_pos pos, const std::vector<llama_seq_id> & seq_ids, bool logits) {
|
||||
llama_batch_add(batch, id, pos, seq_ids, logits);
|
||||
}
|
||||
|
||||
|
||||
void common_init() {
|
||||
#ifdef NDEBUG
|
||||
const char * build_type = "";
|
||||
@@ -143,8 +125,7 @@ struct mtmd_cli_context {
mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
model = llama_init.model; //.get();
lctx = llama_init.context; //.get();
vocab = llama_model_get_vocab(model);
smpl = common_sampler_init(model, params.sparams); //sampling);
smpl = common_sampler_init(vocab, params.sparams); //sampling);
n_threads = params.n_threads;
batch = llama_batch_init(1, 0, 1); // batch for next token generation
n_batch = params.n_batch;

@@ -225,7 +206,7 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) {
break;
}
llama_token token_id = common_sampler_sample(ctx.smpl, ctx.lctx, -1);
llama_token token_id = common_sampler_sample(ctx.smpl, ctx.lctx, nullptr, -1);
generated_tokens.push_back(token_id);
common_sampler_accept(ctx.smpl, ctx.lctx, token_id, true);

@@ -402,7 +383,7 @@ int main(int argc, char ** argv) {
}
if (line == "/clear") {
ctx.n_past = 0;
llama_kv_cache_seq_rm(ctx.lctx, 0, 1, -1);
llama_memory_seq_rm(ctx.lctx, 0, 1, -1);
//llama_memory_seq_rm(llama_get_memory(ctx.lctx), 0, 1, -1); // keep BOS
LOG_TEE("Chat history cleared\n\n");
continue;

@@ -51,7 +51,7 @@ static std::vector<std::string> k_prompts = {
struct client {
~client() {
if (ctx_sampling) {
llama_sampling_free(ctx_sampling);
common_sampler_free(ctx_sampling);
}
}

@@ -161,7 +161,7 @@ int main(int argc, char ** argv) {
for (size_t i = 0; i < clients.size(); ++i) {
auto & client = clients[i];
client.id = i;
client.ctx_sampling = llama_sampling_init(llama_get_model_vocab(model), params.sparams);
client.ctx_sampling = common_sampler_init(llama_get_model_vocab(model), params.sparams);
}
std::vector<llama_token> tokens_system;

@@ -190,7 +190,7 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
for (int32_t i = 0; i < n_tokens_system; ++i) {
llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
common_batch_add(batch, tokens_system[i], i, { 0 }, false);
}
if (llama_decode(ctx, batch) != 0) {

@@ -200,7 +200,7 @@ int main(int argc, char ** argv) {
// assign the system KV cache to all parallel sequences
for (int32_t i = 1; i <= n_clients; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
llama_memory_seq_cp(ctx, 0, i, -1, -1);
}
LOG_TEE("\n");

@@ -214,7 +214,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_dump_view_seqs(kvc_view, 40);
}
llama_batch_clear(batch);
common_batch_clear(batch);
// decode any currently ongoing sequences
for (auto & client : clients) {

@@ -224,7 +224,7 @@ int main(int argc, char ** argv) {
client.i_batch = batch.n_tokens;
llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
client.n_decoded += 1;
}

@@ -232,9 +232,9 @@ int main(int argc, char ** argv) {
if (batch.n_tokens == 0) {
// all sequences have ended - clear the entire KV cache
for (int i = 1; i <= n_clients; ++i) {
llama_kv_cache_seq_rm(ctx, i, -1, -1);
llama_memory_seq_rm(ctx, i, -1, -1);
// but keep the system prompt
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
llama_memory_seq_cp(ctx, 0, i, -1, -1);
}
LOG_TEE("%s: clearing the KV cache\n", __func__);

@@ -253,14 +253,14 @@ int main(int argc, char ** argv) {
client.prompt = client.input + "\nAssistant:";
client.response = "";
llama_sampling_reset(llama_get_model_vocab(model), client.ctx_sampling);
common_sampler_reset(llama_get_model_vocab(model), client.ctx_sampling);
// do not prepend BOS because we have a system prompt!
std::vector<llama_token> tokens_prompt;
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
}
// extract the logits only for the last token

@@ -341,9 +341,9 @@ int main(int argc, char ** argv) {
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
const llama_token id = common_sampler_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
llama_sampling_accept(client.ctx_sampling, ctx, id, true);
common_sampler_accept(client.ctx_sampling, ctx, id, true);
if (client.n_decoded == 1) {
// start measuring generation time after the first token to make sure all concurrent clients

@@ -351,7 +351,7 @@ int main(int argc, char ** argv) {
client.t_start_gen = ggml_time_us();
}
const std::string token_str = llama_token_to_piece(ctx, id);
const std::string token_str = common_token_to_piece(ctx, id);
client.response += token_str;
client.sampled = id;

@@ -371,8 +371,8 @@ int main(int argc, char ** argv) {
}
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
llama_memory_seq_rm(ctx, client.id + 1, -1, -1);
llama_memory_seq_cp(ctx, 0, client.id + 1, -1, -1);
const auto t_main_end = ggml_time_us();
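As the hunks above show, resetting one client while keeping the shared system prompt now takes two llama_memory_* calls; a sketch, assuming sequence 0 holds the system prompt and client sequences are 1-based:

llama_memory_seq_rm(ctx, client_seq_id, -1, -1);    // drop everything this client generated
llama_memory_seq_cp(ctx, 0, client_seq_id, -1, -1); // re-share the system prompt KV data from seq 0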
@@ -126,17 +126,17 @@ int main(int argc, char ** argv) {
const int ib = i/n_batch - 1;
const int bd = n_batch_grp*(n_grp - 1);
llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
llama_memory_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
llama_memory_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
llama_kv_cache_update (ctx);
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
}
llama_batch_clear(batch);
common_batch_clear(batch);
for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
}
if (i + n_batch >= n_tokens_all) {

@@ -160,17 +160,17 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
llama_memory_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_memory_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
//llama_kv_cache_defrag (ctx);
llama_kv_cache_update (ctx);
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
llama_batch_clear(batch);
common_batch_clear(batch);
for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
}
if (i + n_batch >= n_tokens_all) {

@@ -191,8 +191,8 @@ int main(int argc, char ** argv) {
if (n_discard > 0) {
LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
llama_memory_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_memory_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
//llama_kv_cache_defrag (ctx);
llama_kv_cache_update (ctx);
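The shift pattern itself is unchanged by the rename: drop n_discard positions after n_keep, then slide the remaining cells back so positions stay contiguous. A sketch with the new names, reusing ctx, n_keep, n_discard and n_ctx from the hunk above:

llama_memory_seq_rm (ctx, 0, n_keep, n_keep + n_discard);            // remove [n_keep, n_keep + n_discard)
llama_memory_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); // shift the tail left by n_discard
llama_kv_cache_update(ctx);                                          // apply the pending shift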
@@ -239,16 +239,16 @@ int main(int argc, char ** argv) {
break;
}
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
LOG_TEE("%s", common_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout);
n_decode += 1;
// prepare the next batch
llama_batch_clear(batch);
common_batch_clear(batch);
// push this new token for next evaluation
llama_batch_add(batch, new_token_id, n_past++, { 0 }, true);
common_batch_add(batch, new_token_id, n_past++, { 0 }, true);
}
n_cur += 1;

@@ -407,7 +407,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_kv_cache_clear(ctx);
llama_memory_clear(ctx);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;

@@ -582,7 +582,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_kv_cache_clear(ctx);
llama_memory_clear(ctx);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;

@@ -869,7 +869,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
//GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, true).size());
//GGML_ASSERT(hs_cur.common_prefix >= ::common_tokenize(ctx, hs_cur.context, true).size());
// Delete the selected random example from the prompt
if (randomize_tasks) {

@@ -906,7 +906,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
size_t i1 = i0;
size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
llama_batch_clear(batch);
common_batch_clear(batch);
// batch as much tasks as possible into the available context
// each task has 4 unique sequence ids - one for each ending

@@ -922,7 +922,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
for (size_t i = 0; i < hs_cur.common_prefix; ++i) {
llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
common_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
}
batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
n_logits += 1;

@@ -932,7 +932,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// TODO: don't evaluate the last token of each sequence
for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) {
const bool needs_logits = i < seq_tokens_size - 1;
llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
common_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
n_logits += needs_logits;
}
}

@@ -951,7 +951,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
return;
}
llama_kv_cache_clear(ctx);
llama_memory_clear(ctx);
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {

@@ -1191,7 +1191,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
size_t i1 = i0;
size_t i_logits = 0;
llama_batch_clear(batch);
common_batch_clear(batch);
while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
int n_logits = 0;

@@ -1201,7 +1201,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
}
for (size_t i = 0; i < data[i1].common_prefix; ++i) {
llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
common_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
}
batch.logits[batch.n_tokens - 1] = true;
n_logits += 1;

@@ -1209,7 +1209,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
for (int s = 0; s < 2; ++s) {
// TODO: end before the last token, no need to predict past the end of the sequences
for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) {
llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
common_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
n_logits += 1;
}
}

@@ -1228,7 +1228,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
return;
}
llama_kv_cache_clear(ctx);
llama_memory_clear(ctx);
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {

@@ -1547,7 +1547,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
size_t i1 = i0;
size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
llama_batch_clear(batch);
common_batch_clear(batch);
// batch as much tasks as possible into the available context
// each task has 4 unique sequence ids - one for each ending

@@ -1569,8 +1569,8 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s;
for (size_t i = 0; i < cur_task.common_prefix; ++i) {
//llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
//common_batch_clear(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
common_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
}
batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
n_logits += 1;

@@ -1580,7 +1580,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
// TODO: don't evaluate the last token of each sequence
for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) {
const bool needs_logits = i < seq_tokens_size - 1;
llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
common_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
n_logits += needs_logits;
}
}

@@ -1601,7 +1601,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
return;
}
llama_kv_cache_clear(ctx);
llama_memory_clear(ctx);
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {

@@ -1787,7 +1787,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
}
// clear the KV cache
llama_kv_cache_clear(ctx);
llama_memory_clear(ctx);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
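Across the perplexity and scoring hunks above, only the clear call changes; every independent chunk still starts from an empty cache. A sketch of the pattern, assuming ctx and num_batches as in the surrounding code:

llama_memory_clear(ctx); // was llama_kv_cache_clear(ctx); the loop itself is unchanged
for (int j = 0; j < num_batches; ++j) {
    // decode batch j of the chunk as before
}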
@@ -76,13 +76,13 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
size_t n_tokens = tokens.size();
for (size_t i = 0; i < n_tokens; i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, true);
common_batch_add(batch, tokens[i], i, { seq_id }, true);
}
}
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
llama_memory_clear(ctx);
// run model
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);

@@ -204,7 +204,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], common_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
}
fprintf(stderr, "\n\n");
}

@@ -232,7 +232,7 @@ int main(int argc, char ** argv) {
if (batch.n_tokens + n_toks > n_batch) {
float * out = emb + p * n_embd;
batch_decode(ctx, batch, out, s, n_embd);
llama_batch_clear(batch);
common_batch_clear(batch);
p += s;
s = 0;
}

@@ -266,7 +266,7 @@ int main(int argc, char ** argv) {
std::vector<float> query_emb(n_embd, 0);
batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
llama_batch_clear(query_batch);
common_batch_clear(query_batch);
// compute cosine similarities
{

@@ -74,7 +74,7 @@ int main(int argc, char ** argv) {
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
auto next_token = llama_sample_token(ctx, &candidates_p);
auto next_token_str = llama_token_to_piece(ctx, next_token);
auto next_token_str = common_token_to_piece(ctx, next_token);
printf("%s", next_token_str.c_str());
result0 += next_token_str;

@@ -133,7 +133,7 @@ int main(int argc, char ** argv) {
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
auto next_token = llama_sample_token(ctx2, &candidates_p);
auto next_token_str = llama_token_to_piece(ctx2, next_token);
auto next_token_str = common_token_to_piece(ctx2, next_token);
printf("%s", next_token_str.c_str());
result1 += next_token_str;

@@ -199,7 +199,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
// erase whole kv
llama_kv_cache_clear(ctx3);
llama_memory_clear(ctx3);
fprintf(stderr, "%s : kv cache cleared\n", __func__);
// restore kv into seq 1

@@ -224,7 +224,7 @@ int main(int argc, char ** argv) {
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
auto next_token = llama_sample_token(ctx3, &candidates_p);
auto next_token_str = llama_token_to_piece(ctx3, next_token);
auto next_token_str = common_token_to_piece(ctx3, next_token);
printf("%s", next_token_str.c_str());
result2 += next_token_str;
@@ -211,12 +211,12 @@ size_t validate_utf8(const std::string& text) {
return len;
}
// TODO: reuse llama_detokenize
// TODO: reuse common_token_to_piece
template <class Iter>
static std::string tokens_to_str(llama_context* ctx, Iter begin, Iter end) {
std::string ret;
for (; begin != end; ++begin) {
ret += llama_token_to_piece(ctx, *begin);
ret += common_token_to_piece(ctx, *begin);
}
return ret;

@@ -228,7 +228,7 @@ std::string tokens_to_str(llama_context* ctx, const llama_tokens& tokens) {
// format incomplete utf-8 multibyte character for output
std::string tokens_to_output_formatted_string(const llama_context* ctx, const llama_token token) {
std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
// if the size is 1 and first bit is 1, meaning it's a partial character
// (size > 1 meaning it's already a known token)

@@ -372,8 +372,8 @@ common_prefix find_common_text_token_prefix(const llama_context* ctx, const llam
llama_tokens a_sub(a.begin() + start, a.end());
llama_tokens b_sub(b.begin() + start, b.end());
std::string a_str = llama_detokenize(ctx, a_sub, true);
std::string b_str = llama_detokenize(ctx, b_sub, true);
std::string a_str = common_token_to_piece(ctx, a_sub, true);
std::string b_str = common_token_to_piece(ctx, b_sub, true);
common_prefix string_prefix;
std::vector<size_t> a_list;

@@ -1722,7 +1722,7 @@ server_tokens::server_tokens(const llama_tokens& tokens, bool has_mtmd) : has_mt
text_tokens.push_back(t);
}
}
return llama_detokenize(ctx, text_tokens, special);
return common_token_to_piece(ctx, text_tokens, special);
}
std::string server_tokens::detokenize(const llama_context* ctx, bool special, size_t start, size_t length) const {

@@ -1744,7 +1744,7 @@ server_tokens::server_tokens(const llama_tokens& tokens, bool has_mtmd) : has_mt
}
++i;
}
return llama_detokenize(ctx, text_tokens, special);
return common_token_to_piece(ctx, text_tokens, special);
}
size_t server_tokens::find_n_from_tokens(const llama_context* ctx, const server_tokens& b, bool special,

@@ -1812,7 +1812,7 @@ server_tokens::server_tokens(const llama_tokens& tokens, bool has_mtmd) : has_mt
std::string endStr = think_token.end;
llama_tokens tokens = get_text_tokens();
std::string str = llama_detokenize(ctx, tokens, true);
std::string str = common_token_to_piece(ctx, tokens, true);
std::vector<std::pair<size_t, size_t>> results;
// Find all positions of start and end

@@ -164,7 +164,7 @@ size_t common_part(const std::string& a, const std::string& b);
// if validate_utf8(text) == text.size(), then the whole text is valid utf8
size_t validate_utf8(const std::string& text);
// TODO: reuse llama_detokenize
// TODO: reuse common_token_to_piece
std::string tokens_to_str(llama_context* ctx, const llama_tokens& tokens);
@@ -37,7 +37,7 @@ server_context::~server_context() {
// Clear any sampling context
for (server_slot& slot : slots) {
if (slot.ctx_sampling != nullptr) {
llama_sampling_free(slot.ctx_sampling);
common_sampler_free(slot.ctx_sampling);
}
if (slot.ctx_dft) {
llama_free(slot.ctx_dft);

@@ -52,16 +52,16 @@ server_context::~server_context() {
}
bool server_context::load_model(const gpt_params& params_) {
params = params_;
params_base = params_;
llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_init_result llama_init = llama_init_from_gpt_params(params_base);
model = llama_init.model;
ctx = llama_init.context;
lora_adapters = llama_init.lora_adapters;
if (model == nullptr) {
LOG_ERROR("unable to load model", { {"model", params.model} });
LOG_ERROR("unable to load model", { {"model", params_base.model} });
return false;
}

@@ -70,26 +70,26 @@ bool server_context::load_model(const gpt_params& params_) {
add_bos_token = llama_should_add_bos_token(model);
has_eos_token = llama_add_eos_token(model) != 1;
chat_templates = common_chat_templates_init(model, params.chat_template);
chat_templates = common_chat_templates_init(model, params_base.chat_template);
try {
common_chat_format_example(chat_templates.get(), params.use_jinja, {});
common_chat_format_example(chat_templates.get(), params_base.use_jinja, {});
}
catch (const std::exception& e) {
LOG_WARNING("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
chat_templates = common_chat_templates_init(model, "chatml");
}
bool has_draft_model = !params.model_draft.empty() || !params.draft_params.empty();
std::string& mmproj_path = params.mmproj.path;
bool has_draft_model = !params_base.model_draft.empty() || !params_base.draft_params.empty();
std::string& mmproj_path = params_base.mmproj.path;
if (!mmproj_path.empty()) {
mtmd_context_params mparams = mtmd_context_params_default();
mparams.use_gpu = params.mmproj_use_gpu;
mparams.use_gpu = params_base.mmproj_use_gpu;
mparams.print_timings = false;
mparams.n_threads = params.n_threads;
mparams.flash_attn_type = params.flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
mparams.image_min_tokens = params.image_min_tokens;
mparams.image_max_tokens = params.image_max_tokens;
mparams.n_threads = params_base.n_threads;
mparams.flash_attn_type = params_base.flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
mparams.verbosity = params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
mparams.image_min_tokens = params_base.image_min_tokens;
mparams.image_max_tokens = params_base.image_max_tokens;
mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
if (mctx == nullptr) {
LOG_ERROR("failed to load multimodal model, '%s'\n", mmproj_path.c_str());

@@ -97,8 +97,8 @@ bool server_context::load_model(const gpt_params& params_) {
}
LOG_INFO("loaded multimodal model, '%s'\n", mmproj_path.c_str());
if (params.ctx_shift) {
params.ctx_shift = false;
if (params_base.ctx_shift) {
params_base.ctx_shift = false;
LOG_WARNING("%s\n", "ctx_shift is not supported by multimodal, it will be disabled");
}

@@ -117,15 +117,15 @@ bool server_context::load_model(const gpt_params& params_) {
LLAMA_LOG_INFO("\n\n==================================loading DRAFT model==================================\n\n");
gpt_params params_dft;
params_dft.devices = params.devices_draft;
params_dft.model = params.model_draft;
params_dft.n_gpu_layers = params.n_gpu_layers_draft;
params_dft.rpc_servers = params.rpc_servers;
params_dft.cache_type_k = params.cache_type_k_draft.empty() ? params.cache_type_k : params.cache_type_k_draft;
params_dft.cache_type_v = params.cache_type_v_draft.empty() ? params.cache_type_v : params.cache_type_v_draft;
params_dft.flash_attn = params.flash_attn;
if (!params.draft_params.empty()) {
auto [argc, argv] = parse_command_line("llama-server " + params.draft_params);
params_dft.devices = params_base.devices_draft;
params_dft.model = params_base.model_draft;
params_dft.n_gpu_layers = params_base.n_gpu_layers_draft;
params_dft.rpc_servers = params_base.rpc_servers;
params_dft.cache_type_k = params_base.cache_type_k_draft.empty() ? params_base.cache_type_k : params_base.cache_type_k_draft;
params_dft.cache_type_v = params_base.cache_type_v_draft.empty() ? params_base.cache_type_v : params_base.cache_type_v_draft;
params_dft.flash_attn = params_base.flash_attn;
if (!params_base.draft_params.empty()) {
auto [argc, argv] = parse_command_line("llama-server " + params_base.draft_params);
if (!gpt_params_parse(argc, argv, params_dft)) {
gpt_params_print_usage(argc, argv, params_dft);
free_command_line(argc, argv);

@@ -135,16 +135,16 @@ bool server_context::load_model(const gpt_params& params_) {
}
LOG_INFO("", { {"model", params_dft.model} });
if (params_dft.n_ctx == 0) {
params_dft.n_ctx = params.n_ctx_draft;
params_dft.n_ctx = params_base.n_ctx_draft;
}
params_dft.n_ctx = params_dft.n_ctx == 0 ? params.n_ctx / params.n_parallel : params_dft.n_ctx;
params_dft.n_ctx = params_dft.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_dft.n_ctx;
params_dft.n_parallel = 1;
params_dft.n_batch = params_dft.n_ctx;
llama_init_result llama_init_dft = llama_init_from_gpt_params(params_dft);
llama_model* model_dft = llama_init_dft.model;
if (model_dft == nullptr) {
LOG_ERROR("failed to load draft model", { {"model", params.model_draft} });
LOG_ERROR("failed to load draft model", { {"model", params_base.model_draft} });
return false;
}
@@ -163,22 +163,22 @@ bool server_context::load_model(const gpt_params& params_) {
}
void server_context::init() {
const int32_t n_ctx_slot = n_ctx / params.n_parallel;
const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;
LOG_INFO("initializing slots", { {"n_slots", params.n_parallel} });
LOG_INFO("initializing slots", { {"n_slots", params_base.n_parallel} });
for (int i = 0; i < params.n_parallel; i++) {
for (int i = 0; i < params_base.n_parallel; i++) {
server_slot slot;
slot.id = i;
slot.ctx = ctx;
slot.n_ctx = n_ctx_slot;
slot.n_predict = params.n_predict;
slot.n_predict = params_base.n_predict;
slot.mctx = mctx;
slot.cache_tokens.has_mtmd = mctx != nullptr;
slot.params.think_tokens = params.think_tokens;
if (params.think_tokens.exclude) {
SRV_WRN("Exclude reasoning tokens when selecting slot based on similarity: start: %s, end: %s\nuse `--reasoning-tokens none` to disable.\n", params.think_tokens.begin.c_str(), params.think_tokens.end.c_str() );
slot.params.think_tokens = params_base.think_tokens;
if (params_base.think_tokens.exclude) {
SRV_WRN("Exclude reasoning tokens when selecting slot based on similarity: start: %s, end: %s\nuse `--reasoning-tokens none` to disable.\n", params_base.think_tokens.begin.c_str(), params_base.think_tokens.end.c_str() );
}
else {
SRV_WRN("%s", "Include reasoning tokens when selecting slot based on similarity\nuse `--reasoning-tokens auto` to exclude reasoning tokens.\n");

@@ -188,8 +188,8 @@ void server_context::init() {
{"n_ctx_slot", slot.n_ctx}
});
const int ga_n = params.grp_attn_n;
const int ga_w = params.grp_attn_w;
const int ga_n = params_base.grp_attn_n;
const int ga_w = params_base.grp_attn_w;
if (ga_n != 1) {
GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT

@@ -208,7 +208,7 @@ void server_context::init() {
slot.ga_n = ga_n;
slot.ga_w = ga_w;
slot.sparams = params.sparams;
slot.sparams = params_base.sparams;
// Initialize speculative decoding if a draft model is loaded
if (ctx_draft) {

@@ -225,7 +225,7 @@ void server_context::init() {
LOG_ERROR("failed to create speculator", {});
return;
}
for (auto& pair : params.replacements_draft) {
for (auto& pair : params_base.replacements_draft) {
llama_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());
}

@@ -245,21 +245,21 @@ void server_context::init() {
const int32_t n_batch = llama_n_batch(ctx);
// only a single seq_id per token is needed
batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1);
batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
}
metrics.init();
if (params.cache_ram_mib != 0) {
if (params.cache_ram_mib < 0) {
if (params_base.cache_ram_mib != 0) {
if (params_base.cache_ram_mib < 0) {
LLAMA_LOG_INFO("prompt cache is enabled, size limit: %s\n", "no limit");
}
else {
LLAMA_LOG_INFO("prompt cache is enabled, size limit: %d MiB\n", params.cache_ram_mib);
LLAMA_LOG_INFO("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib);
}
LLAMA_LOG_INFO("%s", "use `--cache-ram 0` to disable the prompt cache\n");
// only apply ram size limit. No token limit for now.
prompt_cache = std::make_unique<server_prompt_cache>(ctx, params.cache_ram_mib, 0);
prompt_cache = std::make_unique<server_prompt_cache>(ctx, params_base.cache_ram_mib, 0);
}
else {
LLAMA_LOG_INFO("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n");

@@ -268,14 +268,14 @@ void server_context::init() {
// thinking is enabled if:
// 1. It's not explicitly disabled (reasoning_budget == 0)
// 2. The chat template supports it
const bool enable_thinking = params.use_jinja && params.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
//LLAMA_LOG_INFO("Enable thinking? %d\n", enable_thinking);
oai_parser_opt = {
/* use_jinja */ params.use_jinja,
/* prefill_assistant */ params.prefill_assistant,
/* reasoning_format */ params.reasoning_format,
/* chat_template_kwargs */ params.default_template_kwargs,
/* use_jinja */ params_base.use_jinja,
/* prefill_assistant */ params_base.prefill_assistant,
/* reasoning_format */ params_base.reasoning_format,
/* chat_template_kwargs */ params_base.default_template_kwargs,
/* common_chat_templates */ chat_templates.get(),
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
/* allow_audio */ mctx ? mtmd_support_audio(mctx) : false,
@@ -500,34 +500,19 @@ size_t server_slot::find_stopping_strings(const std::string& text, const size_t
void server_slot::print_timings() const {
char buffer[512];
double t_token = t_prompt_processing / n_prompt_tokens_processed;
double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
double t_prompt = t_prompt_processing / n_prompt_tokens_processed;
double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
//snprintf(buffer, 512, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
// t_prompt_processing, n_prompt_tokens_processed,
// t_token, n_tokens_second);
double t_gen = t_token_generation / n_decoded;
double n_gen_second = 1e3 / t_token_generation * n_decoded;
//LOG_INFO(buffer, {});
double t_token_gen = t_token_generation / n_decoded;
double n_tokens_second_gen = 1e3 / t_token_generation * n_decoded;
//snprintf(buffer, 512, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
// t_token_generation, n_decoded,
// t_token, n_tokens_second);
//LOG_INFO(buffer, {});
//snprintf(buffer, 512, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
//LOG_INFO(buffer, {});
SLT_INF(*this,
"\n"
"prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
" eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
" total time = %10.2f ms / %5d tokens\n",
t_prompt_processing, n_prompt_tokens_processed, t_token, n_tokens_second,
t_token_generation, n_decoded, t_token_gen, n_tokens_second_gen,
t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
t_token_generation, n_decoded, t_gen, n_gen_second,
t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
if (n_draft_total > 0) {
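The renamed locals keep the units straight: t_prompt_processing and t_token_generation are totals in milliseconds, so the per-token cost is total / count and the throughput is 1e3 * count / total. For example (hypothetical numbers), 2500 ms spent on 500 prompt tokens gives t_prompt = 2500 / 500 = 5 ms per token and n_prompt_second = 1e3 / 2500 * 500 = 200 tokens per second.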
@@ -795,7 +780,7 @@ server_slot* server_context::get_available_slot(const server_task& task) {
bool server_context::launch_slot_with_task(server_slot& slot, server_task& task) {
slot_params default_params;
// Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
llama_sampling_params default_sparams = params.sparams;
llama_sampling_params default_sparams = params_base.sparams;
auto& data = task.data;
if (data.count("__oaicompat") != 0) {

@@ -848,9 +833,9 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
slot.params.post_sampling_probs = json_value(data, "post_sampling_probs", default_params.post_sampling_probs);
// speculative decoding parameters
slot.params.speculative.n_max = json_value(data, "speculative.n_max", params.n_draft);
slot.params.speculative.n_min = json_value(data, "speculative.n_min", params.n_draft_min);
slot.params.speculative.p_min = json_value(data, "speculative.p_min", params.p_draft_min);
slot.params.speculative.n_max = json_value(data, "speculative.n_max", params_base.n_draft);
slot.params.speculative.n_min = json_value(data, "speculative.n_min", params_base.n_draft_min);
slot.params.speculative.p_min = json_value(data, "speculative.p_min", params_base.p_draft_min);
// Clamp speculative parameters
slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min);

@@ -945,7 +930,7 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
if (penalty_prompt != data.end()) {
if (penalty_prompt->is_string()) {
const auto penalty_prompt_string = penalty_prompt->get<std::string>();
slot.sparams.penalty_prompt_tokens = llama_tokenize(model, penalty_prompt_string, false);
slot.sparams.penalty_prompt_tokens = common_tokenize(model, penalty_prompt_string, false);
if (slot.params.n_predict > 0) {
slot.sparams.penalty_prompt_tokens.reserve(slot.sparams.penalty_prompt_tokens.size() + slot.params.n_predict);

@@ -988,7 +973,7 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
else {
slot.params.oaicompat_chat_syntax.format = default_params.oaicompat_chat_syntax.format;
}
common_reasoning_format reasoning_format = params.reasoning_format;
common_reasoning_format reasoning_format = params_base.reasoning_format;
if (data.contains("reasoning_format")) {
reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
}

@@ -1003,7 +988,7 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
const auto preserved_tokens = data.find("preserved_tokens");
if (preserved_tokens != data.end()) {
for (const auto& t : *preserved_tokens) {
auto ids = llama_tokenize(model, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
auto ids = common_tokenize(model, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
if (ids.size() == 1) {
LOG("Preserved token: %d\n", ids[0]);
slot.sparams.preserved_tokens.insert(ids[0]);

@@ -1020,7 +1005,7 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
server_grammar_trigger ct(t);
if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
const auto& word = ct.value.value;
auto ids = llama_tokenize(model, word, /* add_special= */ false, /* parse_special= */ true);
auto ids = common_tokenize(model, word, /* add_special= */ false, /* parse_special= */ true);
if (ids.size() == 1) {
auto token = ids[0];
if (std::find(slot.sparams.preserved_tokens.begin(), slot.sparams.preserved_tokens.end(), (llama_token)token) == slot.sparams.preserved_tokens.end()) {

@@ -1085,7 +1070,7 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
}
}
else if (el[0].is_string()) {
auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
auto toks = common_tokenize(model, el[0].get<std::string>(), false);
for (auto tok : toks) {
slot.sparams.logit_bias[tok] = bias;
}

@@ -1128,9 +1113,9 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
{
if (slot.ctx_sampling != nullptr) {
llama_sampling_free(slot.ctx_sampling);
common_sampler_free(slot.ctx_sampling);
}
slot.ctx_sampling = llama_sampling_init(llama_get_model_vocab(model), slot.sparams);
slot.ctx_sampling = common_sampler_init(llama_get_model_vocab(model), slot.sparams);
if (slot.ctx_sampling == nullptr) {
// for now, the only error that may happen here is invalid grammar
send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);

@@ -1153,7 +1138,7 @@ void server_context::kv_cache_clear() {
LOG_VERBOSE("clearing KV cache", {});
// clear the entire KV cache
llama_kv_cache_clear(ctx);
llama_memory_clear(ctx);
clean_kv_cache = false;
}

@@ -1174,10 +1159,10 @@ void server_context::system_prompt_update() {
for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);
llama_batch_clear(batch);
common_batch_clear(batch);
for (int32_t j = 0; j < n_tokens; ++j) {
llama_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
common_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
}
if (llama_decode(ctx, batch) != 0) {

@@ -1187,8 +1172,8 @@ void server_context::system_prompt_update() {
}
// assign the system KV cache to all parallel sequences
for (int32_t i = 1; i <= params.n_parallel; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
for (int32_t i = 1; i <= params_base.n_parallel; ++i) {
llama_memory_seq_cp(ctx, 0, i, -1, -1);
}
}

@@ -1268,7 +1253,7 @@ bool server_context::process_token(completion_token_output& result, server_slot&
}
// check the limits
if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) {
if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
slot.stopped_limit = true;
slot.has_next_token = false;

@@ -1297,7 +1282,7 @@ bool server_context::process_token(completion_token_output& result, server_slot&
{ "slot.n_prompt_tokens", slot.n_prompt_tokens },
{ "slot.n_decoded", slot.n_decoded },
{ "slot.n_predict", slot.n_predict },
{ "n_slots", params.n_parallel },
{ "n_slots", params_base.n_parallel },
{ "slot.n_ctx", slot.n_ctx },
{ "n_ctx", n_ctx },
{ "n_ctx_train", n_ctx_train },

@@ -1330,7 +1315,7 @@ void server_context::populate_token_probs(const server_slot& slot, completion_to
size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
if (post_sampling) {
const auto* cur_p = llama_sampling_get_candidates(slot.ctx_sampling);
const auto* cur_p = common_sampler_get_candidates(slot.ctx_sampling);
const size_t max_probs = cur_p->size;
// set probability for sampled token

@@ -1346,7 +1331,7 @@ void server_context::populate_token_probs(const server_slot& slot, completion_to
for (size_t i = 0; i < std::min(max_probs, n_probs); i++) {
result.probs.push_back({
cur_p->data[i].id,
llama_detokenize(ctx, {cur_p->data[i].id}, special),
common_token_to_piece(ctx, {cur_p->data[i].id}, special),
cur_p->data[i].p
});
}

@@ -1362,7 +1347,7 @@ void server_context::populate_token_probs(const server_slot& slot, completion_to
for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
result.probs.push_back({
cur[i].id,
llama_detokenize(ctx, {cur[i].id}, special),
common_token_to_piece(ctx, {cur[i].id}, special),
cur[i].p
});
}

@@ -1387,7 +1372,7 @@ json server_context::get_formated_generation(const server_slot& slot) const {
return json{
{"n_ctx", slot.n_ctx},
{"n_predict", slot.n_predict}, // Server configured n_predict
{"model", params.model_alias},
{"model", params_base.model_alias},
{"seed", slot.sparams.seed},
{"temperature", slot.sparams.temp},
{"dynatemp_range", slot.sparams.dynatemp_range},

@@ -1548,7 +1533,7 @@ void server_context::send_final_response(server_slot& slot) {
{"generated_text", slot.generated_text}, // Always include full text for finish_reason logic
{"id_slot", slot.id},
{"stop", true},
{"model", params.model_alias},
{"model", params_base.model_alias},
{"tokens_predicted", slot.n_decoded},
{"tokens_evaluated", slot.n_prompt_tokens},
{"generation_settings", get_formated_generation(slot)},
@@ -1950,7 +1935,7 @@ void server_context::process_single_task(server_task&& task) {
// Erase token cache
const size_t n_erased = slot->cache_tokens.size();
llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1);
llama_memory_seq_rm(ctx, slot->id + 1, -1, -1);
slot->cache_tokens.clear();
server_task_result result;

@@ -2007,8 +1992,8 @@ void server_context::print_tokens(const server_tokens& prompt, const server_toke
}
void server_context::discard_n_kv_and_cache_tokens(llama_context* ctx, server_slot& slot, int32_t n_keep, int32_t n_discard) {
llama_kv_cache_seq_rm(ctx, slot.id, n_keep, n_keep + n_discard);
llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
llama_memory_seq_rm(ctx, slot.id, n_keep, n_keep + n_discard);
llama_memory_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
if (slot.params.cache_prompt) {
slot.cache_tokens.discard_n_tokens(n_keep, n_discard);
}

@@ -2067,12 +2052,8 @@ void server_context::context_shift_prompt(llama_context* ctx, server_slot& slot,
slot.n_prompt_tokens = slot.prompt_tokens.size();
}
void server_context::update_slots() {
if (system_need_update) {
system_prompt_update();
}
// release slots
void server_context::release_slots()
{
for (auto& slot : slots) {
if (slot.command == SLOT_COMMAND_RELEASE) {
slot.state = SLOT_STATE_IDLE;

@@ -2092,11 +2073,10 @@ void server_context::update_slots() {
queue_tasks.notify_slot_changed();
}
}
}
// check if all slots are idle
{
bool server_context::slots_idle(){
bool all_idle = true;
for (auto& slot : slots) {
if (slot.state != SLOT_STATE_IDLE || slot.command != SLOT_COMMAND_NONE) {
all_idle = false;

@@ -2109,27 +2089,16 @@ void server_context::update_slots() {
if (system_prompt.empty() && clean_kv_cache) {
kv_cache_clear();
}
return;
all_idle = true;
}
}
return all_idle;
}
{
LOG_VERBOSE("posting NEXT_RESPONSE", {});
server_task task;
task.type = SERVER_TASK_TYPE_NEXT_RESPONSE;
task.id_target = -1;
queue_tasks.post(std::move(task));
}
// apply context-shift if needed
// TODO: simplify and improve
void server_context::context_shift() {
for (server_slot& slot : slots) {
if (slot.ga_n == 1) {
if (slot.is_processing() && (int)system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
if (!params.ctx_shift) {
if (!params_base.ctx_shift) {
// this check is redundant (for good)
// we should never get here, because generation should already stopped in process_token()
send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER);

@@ -2176,15 +2145,9 @@ void server_context::update_slots() {
}
}
}
}
// start populating the batch for this iteration
llama_batch_clear(batch);
auto accept_special_token = [&](server_slot& slot, llama_token token) {
return params.special || slot.sparams.preserved_tokens.find(token) != slot.sparams.preserved_tokens.end();
};
// frist, add sampled tokens from any ongoing sequences
void server_context::add_sampled_tokens() {
for (auto& slot : slots) {
if (slot.state == SLOT_STATE_IDLE) {
continue;
@@ -2209,7 +2172,7 @@ void server_context::update_slots() {
// add the sampled token to the batch
slot.i_batch_dft.push_back(batch.n_tokens);
llama_batch_add(batch, slot.sampled, slot.cache_tokens.pos_next(), { slot.id }, true);
common_batch_add(batch, slot.sampled, slot.cache_tokens.pos_next(), { slot.id }, true);
slot.cache_tokens.push_back(slot.sampled);
if (slot.params.speculative.n_min > (int)draft.size()) {

@@ -2226,7 +2189,7 @@ void server_context::update_slots() {
// add all drafted tokens to the batch
for (size_t i = 0; i < draft.size(); i++) {
slot.i_batch_dft.push_back(batch.n_tokens);
llama_batch_add(batch, draft[i], slot.cache_tokens.pos_next(), { slot.id }, true);
common_batch_add(batch, draft[i], slot.cache_tokens.pos_next(), { slot.id }, true);
slot.cache_tokens.push_back(draft[i]);
}
slot.drafted = std::move(draft);

@@ -2236,7 +2199,7 @@ void server_context::update_slots() {
// no speculative decoding
slot.i_batch = batch.n_tokens;
llama_batch_add(batch, slot.sampled, slot.cache_tokens.pos_next(), { slot.id }, true);
common_batch_add(batch, slot.sampled, slot.cache_tokens.pos_next(), { slot.id }, true);
slot.cache_tokens.push_back(slot.sampled);

@@ -2245,18 +2208,10 @@ void server_context::update_slots() {
}
slot.n_past = slot.cache_tokens.n_tokens();
}
}
// process in chunks of params.n_batch
int32_t n_batch = llama_n_batch(ctx);
int32_t n_ubatch = llama_n_ubatch(ctx);
// track if this is an embedding or non-embedding batch
// if we've added sampled tokens above, we are in non-embedding mode
// -1: none, 0: non-embedding, 1: embedding
int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
// next, batch any pending prompts without exceeding n_batch
if (params.cont_batching || batch.n_tokens == 0) {
void server_context::batch_pending_prompt(const int32_t n_ubatch, const int32_t n_batch, int32_t & batch_type) {
if (params_base.cont_batching || batch.n_tokens == 0) {
for (auto& slot : slots) {
// this slot still has a prompt to be processed
if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT) {

@@ -2275,8 +2230,8 @@ void server_context::update_slots() {
if (slot.infill) {
const bool add_bos = llama_should_add_bos_token(model);
bool suff_rm_leading_spc = true;
if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
if (params_base.input_suffix.find_first_of(' ') == 0 && params_base.input_suffix.size() > 1) {
params_base.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}

@@ -2291,8 +2246,8 @@ void server_context::update_slots() {
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
auto embd_inp = params_base.spm_infill ? suffix_tokens : prefix_tokens;
auto embd_end = params_base.spm_infill ? prefix_tokens : suffix_tokens;
if (add_bos) {
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
}

@@ -2350,7 +2305,7 @@ void server_context::update_slots() {
// if input prompt is too big, truncate it (if group attention self-extend is disabled)
// context shift for prompt processing
if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) {
if (!params.ctx_shift) {
if (!params_base.ctx_shift) {
send_error(slot, "the request exceeds the available context size, try increasing it", ERROR_TYPE_SERVER);
slot.release();
continue;

@@ -2389,7 +2344,7 @@ void server_context::update_slots() {
else {
slot.n_discarded_prompt = 0;
}
llama_sampling_reset(llama_get_model_vocab(model), slot.ctx_sampling);
common_sampler_reset(llama_get_model_vocab(model), slot.ctx_sampling);
if (!slot.params.cache_prompt) {
slot.n_past_se = 0;

@@ -2424,7 +2379,7 @@ void server_context::update_slots() {
// push the prompt into the sampling context (do not apply grammar)
for (int i = 0; i < slot.n_past; ++i) {
llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);
common_sampler_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);
}
}
}
@@ -2471,14 +2426,14 @@ void server_context::update_slots() {
slot.cache_tokens.keep_first(slot.n_past);
int p0 = (int)system_tokens.size() + slot.n_past;
p0 = system_tokens.size() + slot.cache_tokens.pos_next();
if (!llama_kv_cache_seq_rm(ctx, slot.id, p0, -1)) {
if (!llama_memory_seq_rm(ctx, slot.id, p0, -1)) {
// could not partially delete (likely using a non-Transformer model)
llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
llama_memory_seq_rm(ctx, slot.id, -1, -1);
p0 = (int)system_tokens.size();
if (p0 != 0) {
// copy over the system prompt when there is one
llama_kv_cache_seq_cp(ctx, 0, slot.id, -1, -1);
llama_memory_seq_cp(ctx, 0, slot.id, -1, -1);
}
// there is no common part left (except for the system prompt)

@@ -2486,7 +2441,7 @@ void server_context::update_slots() {
slot.n_past_se = 0;
slot.ga_i = 0;
// TODO: is the system prompt ever in the sampling context?
llama_sampling_reset(llama_get_model_vocab(model), slot.ctx_sampling);
common_sampler_reset(llama_get_model_vocab(model), slot.ctx_sampling);
}
LOG_INFO("kv cache rm [p0, end)", {

@@ -2546,7 +2501,7 @@ void server_context::update_slots() {
}
int p0 = system_tokens.size() + slot.cache_tokens.pos_next();
llama_batch_add(batch, cur_tok, p0, { slot.id }, slot.embedding);
common_batch_add(batch, cur_tok, p0, { slot.id }, slot.embedding);
slot.cache_tokens.push_back(cur_tok);

@@ -2571,11 +2526,11 @@ void server_context::update_slots() {
GGML_ASSERT(batch.n_tokens > 0);
GGML_ASSERT((size_t)slot.n_prompt_tokens == slot.prompt_tokens.size());
llama_sampling_reset(llama_get_model_vocab(model), slot.ctx_sampling);
common_sampler_reset(llama_get_model_vocab(model), slot.ctx_sampling);
for (int i = 0; i < slot.n_prompt_tokens; ++i) {
llama_token id = slot.prompt_tokens[i];
if (id != LLAMA_TOKEN_NULL) {
llama_sampling_accept(slot.ctx_sampling, ctx, id, false);
common_sampler_accept(slot.ctx_sampling, ctx, id, false);
}
}

@@ -2599,51 +2554,111 @@ void server_context::update_slots() {
}
}
}
}
if (batch.n_tokens == 0) {
LOG_VERBOSE("no tokens to decode", {});
return;
void server_context::extend_context(const int32_t n_tokens) {
for (auto& slot : slots) {
if (slot.ga_n != 1) {
// context extension via Self-Extend
// TODO: simplify and/or abstract this
while (slot.n_past_se >= slot.ga_i + slot.ga_w) {
const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w;
const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
LOG_TEE("\n");
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
llama_memory_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
llama_memory_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
llama_memory_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
slot.n_past_se -= bd;
slot.ga_i += slot.ga_w / slot.ga_n;
LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
}
slot.n_past_se += n_tokens;
}
}
}
LOG_VERBOSE("decoding batch", {
|
||||
{"n_tokens", batch.n_tokens},
|
||||
});
|
||||
void server_context::handle_decode_result(const int ret) {
|
||||
|
||||
// make sure we're in the right embedding mode
|
||||
llama_set_embeddings(ctx, batch_type == 1);
|
||||
}
|
||||
|
||||
// process the created batch of tokens
|
||||
for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
|
||||
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
|
||||
void server_context::speculative_decoding_accept() {
|
||||
for (auto& slot : slots) {
|
||||
if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch_dft.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (auto& slot : slots) {
|
||||
if (slot.ga_n != 1) {
|
||||
// context extension via Self-Extend
|
||||
// TODO: simplify and/or abstract this
|
||||
while (slot.n_past_se >= slot.ga_i + slot.ga_w) {
|
||||
const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w;
|
||||
const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
|
||||
const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
|
||||
size_t n_draft = slot.drafted.size();
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
|
||||
LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
|
||||
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
|
||||
// the accepted tokens from the speculation
|
||||
const auto ids = common_sampler_sample_and_accept_n(slot.ctx_sampling, ctx, slot.i_batch_dft, slot.drafted);
|
||||
slot.i_batch_dft.clear();
|
||||
slot.drafted.clear();
|
||||
|
||||
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
|
||||
llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
|
||||
slot.n_past += ids.size();
|
||||
slot.n_decoded += ids.size();
|
||||
const int64_t t_current = ggml_time_us();
|
||||
slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
|
||||
|
||||
slot.n_past_se -= bd;
|
||||
// update how many tokens out of those tested were accepted
|
||||
slot.n_draft_accepted += ids.size() - 1;
|
||||
|
||||
slot.ga_i += slot.ga_w / slot.ga_n;
|
||||
// rollback to the state before sampling the draft tokens
|
||||
slot.cache_tokens.keep_first(slot.cache_tokens.n_tokens() - n_draft);
|
||||
// add accepted tokens to the prompt
|
||||
slot.cache_tokens.insert({ ids.begin(), ids.end() - 1 });
|
||||
slot.sampled = ids.back(); // last accepted token
|
||||
slot.n_past = slot.cache_tokens.n_tokens();
|
||||
llama_memory_seq_rm(ctx, slot.id, slot.n_past, -1);
|
||||
|
||||
LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
|
||||
}
|
||||
for (size_t i = 0; i < ids.size(); ++i) {
|
||||
completion_token_output result;
|
||||
|
||||
slot.n_past_se += n_tokens;
|
||||
result.tok = ids[i];
|
||||
result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok));
|
||||
result.prob = 1.0f; // set later
|
||||
|
||||
if (slot.sparams.n_probs > 0) {
|
||||
populate_token_probs(slot, result, slot.params.post_sampling_probs, params_base.special, i);
|
||||
}
|
||||
|
||||
if (!process_token(result, slot)) {
|
||||
// release slot because of stop condition
|
||||
slot.release();
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
metrics.on_prediction(slot);
|
||||
break;
|
||||
}
|
||||
}
|
||||
SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int)ids.size() - 1, (int)slot.drafted.size(), slot.n_past);
|
||||
LOG_VERBOSE("speculative decoding result", {
|
||||
{"id_slot", slot.id},
|
||||
{"accepted", (int)ids.size() - 1},
|
||||
{"total", (int)slot.drafted.size()},
|
||||
{"new_n_past", slot.n_past}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool server_context::accept_special_token(const server_slot& slot, const llama_token token) {
|
||||
return params_base.special || slot.sparams.preserved_tokens.find(token) != slot.sparams.preserved_tokens.end();
|
||||
};
|
||||
|
||||
void server_context::process_batch_tokens(int32_t & n_batch) {
|
||||
for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
|
||||
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
|
||||
extend_context(n_tokens);
|
||||
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
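The Self-Extend arithmetic in extend_context() above is easier to follow with concrete numbers. The standalone sketch below (hypothetical ga_n/ga_w values, not part of this patch) reproduces the ib/bd/dd computation and the position ranges that the server passes to llama_memory_seq_add and llama_memory_seq_div.

    // Standalone illustration of one Self-Extend compression pass.
    #include <cstdio>

    int main() {
        const int ga_n = 4, ga_w = 512;    // hypothetical group-attention factor and window
        int ga_i = 0, n_past_se = 600;     // hypothetical slot state

        while (n_past_se >= ga_i + ga_w) {
            const int ib = (ga_n * ga_i) / ga_w;
            const int bd = (ga_w / ga_n) * (ga_n - 1);
            const int dd = (ga_w / ga_n) - ib * bd - ga_w;

            // the three KV-cache edits the server performs for this window:
            printf("shift [%d, %d] by %d\n", ga_i, n_past_se, ib * bd);                       // llama_memory_seq_add
            printf("div   [%d, %d] by %d\n", ga_i + ib * bd, ga_i + ib * bd + ga_w, ga_n);    // llama_memory_seq_div
            printf("shift [%d, %d] by %d\n", ga_i + ib * bd + ga_w, n_past_se + ib * bd, dd); // llama_memory_seq_add

            n_past_se -= bd;           // 600 -> 216: the window now occupies ga_w/ga_n positions
            ga_i      += ga_w / ga_n;  // 0 -> 128: the next window starts after the compressed block
        }
        printf("n_past_se = %d, ga_i = %d\n", n_past_se, ga_i);
        return 0;
    }

With these values the 512-token window [0, 512) is divided down to [0, 128) and the tail [512, 600) is shifted back to [128, 216), which is why n_past_se drops by bd.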
@@ -2668,7 +2683,8 @@ void server_context::update_slots() {
{"n_batch", ret},
{"ret", ret},
});
} else {
}
else {
LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", {
{"i", i},
{"n_batch", ret},
@@ -2684,12 +2700,9 @@ void server_context::update_slots() {
LLAMA_LOG_INFO("n_past = %d\n", (int)slot.cache_tokens.size());
send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
}

}
break; // break loop of n_batch
}


// retry with half the batch size to try to find a free slot in the KV cache
n_batch /= 2;
i -= n_batch;
@@ -2703,10 +2716,6 @@ void server_context::update_slots() {
continue; // continue loop of n_batch
}

// technically, measuring the time here excludes the sampling time for the last batch
// but on the other hand, we don't want to do too many system calls to measure the time, so it's ok
const int64_t t_current = ggml_time_us();

for (auto& slot : slots) {
if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch < (int)i || slot.i_batch >= (int)(i + n_tokens)) {
continue; // continue loop of slots
@@ -2725,9 +2734,9 @@ void server_context::update_slots() {
continue; // sample using speculative decoding
}
const int tok_idx = slot.i_batch - i;
const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, tok_idx);
const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, NULL, tok_idx);

llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
common_sampler_accept(slot.ctx_sampling, ctx, id, true);

slot.n_decoded += 1;

@@ -2739,15 +2748,14 @@ void server_context::update_slots() {
metrics.on_prompt_eval(slot);
}

//slot.t_token_generation = (t_current - slot.t_start_generation) / 1e3;
slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;

result.tok = id;
result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs
result.text_to_send = llama_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok));
result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok));

if (slot.sparams.n_probs > 0) {
populate_token_probs(slot, result, slot.params.post_sampling_probs, params.special, tok_idx);
populate_token_probs(slot, result, slot.params.post_sampling_probs, params_base.special, tok_idx);
}

if (!process_token(result, slot)) {
@@ -2761,64 +2769,67 @@ void server_context::update_slots() {
}

// speculative decoding - main model sample and accept
for (auto& slot : slots) {
if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch_dft.empty()) {
continue;
}

size_t n_draft = slot.drafted.size();

// the accepted tokens from the speculation
const auto ids = llama_sampling_sample_and_accept_n(slot.ctx_sampling, ctx, slot.i_batch_dft, slot.drafted);
slot.i_batch_dft.clear();
slot.drafted.clear();

slot.n_past += ids.size();
slot.n_decoded += ids.size();

slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;

// update how many tokens out of those tested were accepted
slot.n_draft_accepted += ids.size() - 1;

// rollback to the state before sampling the draft tokens
slot.cache_tokens.keep_first(slot.cache_tokens.n_tokens() - n_draft);
// slot.n_past -= n_draft;
// add accepted tokens to the prompt
slot.cache_tokens.insert({ ids.begin(), ids.end() - 1 });
slot.sampled = ids.back(); // last accepted token
slot.n_past = slot.cache_tokens.n_tokens();
llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);

for (size_t i = 0; i < ids.size(); ++i) {
completion_token_output result;

result.tok = ids[i];
result.text_to_send = llama_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok));
result.prob = 1.0f; // set later

if (slot.sparams.n_probs > 0) {
populate_token_probs(slot, result, slot.params.post_sampling_probs, params.special, i);
}

if (!process_token(result, slot)) {
// release slot because of stop condition
slot.release();
slot.print_timings();
send_final_response(slot);
metrics.on_prediction(slot);
break;
}
}
SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int)ids.size() - 1, (int)slot.drafted.size(), slot.n_past);
LOG_VERBOSE("speculative decoding result", {
{"id_slot", slot.id},
{"accepted", (int)ids.size() - 1},
{"total", (int)slot.drafted.size()},
{"new_n_past", slot.n_past}
});
}
speculative_decoding_accept();
}
}

void server_context::update_slots() {
if (system_need_update) {
system_prompt_update();
}
// release slots
release_slots();

// check if all slots are idle
if (slots_idle()) {
return;
}

{
LOG_VERBOSE("posting NEXT_RESPONSE", {});
server_task task;
task.type = SERVER_TASK_TYPE_NEXT_RESPONSE;
task.id_target = -1;

queue_tasks.post(std::move(task));
}

// apply context-shift if needed
// TODO: simplify and improve
context_shift();

// start populating the batch for this iteration
common_batch_clear(batch);

// first, add sampled tokens from any ongoing sequences
add_sampled_tokens();

// process in chunks of params.n_batch
int32_t n_batch = llama_n_batch(ctx);
int32_t n_ubatch = llama_n_ubatch(ctx);

// track if this is an embedding or non-embedding batch
// if we've added sampled tokens above, we are in non-embedding mode
// -1: none, 0: non-embedding, 1: embedding
int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;

// next, batch any pending prompts without exceeding n_batch
batch_pending_prompt(n_ubatch, n_batch, batch_type);

if (batch.n_tokens == 0) {
LOG_VERBOSE("no tokens to decode", {});
return;
}

LOG_VERBOSE("decoding batch", {
{"n_tokens", batch.n_tokens},
});

// make sure we're in the right embedding mode
llama_set_embeddings(ctx, batch_type == 1);

// process the created batch of tokens
process_batch_tokens(n_batch);

LOG_VERBOSE("run slots completed", {});
}

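For the acceptance bookkeeping in speculative_decoding_accept() above, the following standalone sketch (made-up token ids, not server code) shows why ids.size() - 1 tokens count as accepted drafts and how the cache is rewound before the verified tokens are re-inserted.

    #include <cstdio>
    #include <vector>

    int main() {
        // cache = prompt tokens plus 3 speculative draft tokens (6, 7, 8)
        std::vector<int> cache = {1, 2, 3, 4, 5, 6, 7, 8};
        const size_t n_draft = 3;              // slot.drafted.size()
        // sample-and-accept verified drafts 6 and 7, then sampled 9 from the target model
        const std::vector<int> ids = {6, 7, 9};

        cache.resize(cache.size() - n_draft);                   // rollback: cache_tokens.keep_first(n - n_draft)
        cache.insert(cache.end(), ids.begin(), ids.end() - 1);  // keep only the verified drafts
        const int    sampled = ids.back();                      // slot.sampled - fed into the next batch
        const size_t n_past  = cache.size();                    // llama_memory_seq_rm(ctx, id, n_past, -1) trims the rest

        printf("accepted %zu/%zu drafts, sampled = %d, n_past = %zu\n",
               ids.size() - 1, n_draft, sampled, n_past);
        return 0;
    }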
@@ -184,7 +184,7 @@ struct server_context {
llama_context* ctx = nullptr;
std::vector<llama_lora_adapter_container> lora_adapters;

gpt_params params;
gpt_params params_base;

llama_batch batch;

@@ -297,5 +297,25 @@ struct server_context {

void update_slots();

void release_slots();

bool slots_idle();

void context_shift();

void add_sampled_tokens();

void batch_pending_prompt(const int32_t n_ubatch, const int32_t n_batch, int32_t & batch_type);

void process_batch_tokens(int32_t & n_batch);

void extend_context(const int32_t n_tokens);

void handle_decode_result(const int ret);

void speculative_decoding_accept();

bool accept_special_token(const server_slot& slot, const llama_token token);

json model_meta() const;
};

@@ -593,7 +593,7 @@ int main(int argc, char ** argv) {
});

LOG_INFO("chat template", {
{"chat_example", common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params.use_jinja, {}).c_str()
{"chat_example", common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja, {}).c_str()
},
{"built_in", params.chat_template.empty()},
});
@@ -990,15 +990,15 @@ int main(int argc, char ** argv) {
}
json data = {
{ "system_prompt", ctx_server.system_prompt.c_str() },
{ "model_alias", ctx_server.params.model_alias },
{ "model_path", ctx_server.params.model},
{ "model_alias", ctx_server.params_base.model_alias },
{ "model_path", ctx_server.params_base.model},
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params.n_parallel },
{ "model_name", get_model_name(ctx_server.params.model)},
{ "total_slots", ctx_server.params_base.n_parallel },
{ "model_name", get_model_name(ctx_server.params_base.model)},
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
{ "bos_token", llama_token_to_piece(ctx_server.ctx, llama_token_bos(ctx_server.model), /* special= */ true)},
{ "eos_token", llama_token_to_piece(ctx_server.ctx, llama_token_eos(ctx_server.model), /* special= */ true)},
{ "model_path", ctx_server.params.model },
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_token_bos(ctx_server.model), /* special= */ true)},
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_token_eos(ctx_server.model), /* special= */ true)},
{ "model_path", ctx_server.params_base.model },
{ "modalities", json {
{"vision", ctx_server.oai_parser_opt.allow_image},
{"audio", ctx_server.oai_parser_opt.allow_audio},
@@ -1007,7 +1007,7 @@ int main(int argc, char ** argv) {

};

if (ctx_server.params.use_jinja) {
if (ctx_server.params_base.use_jinja) {
if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) {
data["chat_template_tool_use"] = tool_use_src;
}
@@ -1026,8 +1026,8 @@ int main(int argc, char ** argv) {
}
}
json data = {
{ "model_name", get_model_name(ctx_server.params.model)},
{ "model_path", ctx_server.params.model },
{ "model_name", get_model_name(ctx_server.params_base.model)},
{ "model_path", ctx_server.params_base.model },
{ "modalities", json {
{"vision", ctx_server.oai_parser_opt.allow_image},
{"audio", ctx_server.oai_parser_opt.allow_audio},
@@ -1088,7 +1088,7 @@ int main(int argc, char ** argv) {
// OAI-compat
task.params.oaicompat = oaicompat;
task.params.oaicompat_cmpl_id = completion_id;
task.params.oaicompat_model = get_model_name(ctx_server.params.model);
task.params.oaicompat_model = get_model_name(ctx_server.params_base.model);
tasks.push_back(std::move(task));
}

@@ -1350,7 +1350,7 @@ int main(int argc, char ** argv) {
};

const auto handle_embeddings_impl = [&ctx_server](const httplib::Request& req, httplib::Response& res, oaicompat_type oaicompat) {
if (!ctx_server.params.embedding) {
if (!ctx_server.params_base.embedding) {
res_err(res, format_error_response("This server does not support embeddings. Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}

@@ -77,7 +77,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "\n");

for (auto id : tokens_list) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
fprintf(stderr, "%s", common_token_to_piece(ctx, id).c_str());
}

fflush(stderr);
@@ -89,7 +89,7 @@ int main(int argc, char ** argv) {

// evaluate the initial prompt
for (size_t i = 0; i < tokens_list.size(); i++) {
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
common_batch_add(batch, tokens_list[i], i, { 0 }, false);
}

// llama_decode will output logits only for the last token of the prompt
@@ -132,14 +132,14 @@ int main(int argc, char ** argv) {
break;
}

LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
LOG_TEE("%s", common_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout);

// prepare the next batch
llama_batch_clear(batch);
common_batch_clear(batch);

// push this new token for next evaluation
llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
common_batch_add(batch, new_token_id, n_cur, { 0 }, true);

n_decode += 1;
}

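The batching examples above all follow the same tokenize -> fill batch -> decode pattern with the renamed helpers. The sketch below is a minimal, hedged illustration of that pattern; it assumes `model` and `ctx` were initialized elsewhere and is not part of this commit.

    #include "common.h"
    #include "llama.h"
    #include <cstdio>
    #include <string>
    #include <vector>

    // Evaluate a prompt with the renamed batch helpers; returns true on success.
    static bool eval_prompt_sketch(llama_model * model, llama_context * ctx, const std::string & prompt) {
        // tokenize with special-token handling enabled
        std::vector<llama_token> tokens = common_tokenize(model, prompt, true, true);

        llama_batch batch = llama_batch_init((int32_t) tokens.size(), 0, 1);
        common_batch_clear(batch);
        for (size_t i = 0; i < tokens.size(); ++i) {
            // request logits only for the last prompt token
            common_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, i + 1 == tokens.size());
            fprintf(stderr, "%s", common_token_to_piece(ctx, tokens[i]).c_str());
        }
        fprintf(stderr, "\n");

        const bool ok = llama_decode(ctx, batch) == 0;
        llama_batch_free(batch);
        return ok;
    }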
@@ -124,8 +124,8 @@ int main(int argc, char ** argv) {
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
llama_token_to_piece(ctx_tgt, i).c_str(),
llama_token_to_piece(ctx_dft, i).c_str());
common_token_to_piece(ctx_tgt, i).c_str(),
common_token_to_piece(ctx_dft, i).c_str());
return 1;
}
}
@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "\n\n");

for (auto id : inp) {
fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
fprintf(stderr, "%s", common_token_to_piece(ctx_tgt, id).c_str());
}

fflush(stderr);
@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
bool has_eos = false;

// target model sampling context
struct llama_sampling_context * ctx_sampling = llama_sampling_init(llama_get_model_vocab(model_tgt), params.sparams);
struct llama_sampling_context * ctx_sampling = common_sampler_init(llama_get_model_vocab(model_tgt), params.sparams);

// draft sequence data
std::vector<seq_draft> drafts(n_seq_dft);
@@ -191,7 +191,7 @@ int main(int argc, char ** argv) {
}

for (int s = 0; s < n_seq_dft; ++s) {
drafts[s].ctx_sampling = llama_sampling_init(llama_get_model_vocab(model_dft), params.sparams);
drafts[s].ctx_sampling = common_sampler_init(llama_get_model_vocab(model_dft), params.sparams);
}

llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
@@ -277,13 +277,13 @@ int main(int argc, char ** argv) {
s_keep = s;
accept = true;
token_id = drafts[s].tokens[i_dft];
token_str = llama_token_to_piece(ctx_tgt, token_id);
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
token_str = common_token_to_piece(ctx_tgt, token_id);
common_sampler_accept(ctx_sampling, ctx_tgt, token_id, true);

LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
break;
} else {
LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], common_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
drafts[s].active = false;

// calculate residual probability
@@ -333,8 +333,8 @@ int main(int argc, char ** argv) {
// sample from the target model
LOG("all drafted tokens were rejected, sampling from residual distribution\n");
token_id = llama_sample_token(ctx_tgt, &dist_tgt);
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
token_str = llama_token_to_piece(ctx_tgt, token_id);
common_sampler_accept(ctx_sampling, ctx_tgt, token_id, true);
token_str = common_token_to_piece(ctx_tgt, token_id);
}

} else {
@@ -342,13 +342,13 @@ int main(int argc, char ** argv) {

// sample from the target model
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
token_id = common_sampler_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);

llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
common_sampler_accept(ctx_sampling, ctx_tgt, token_id, true);

//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());

token_str = llama_token_to_piece(ctx_tgt, token_id);
token_str = common_token_to_piece(ctx_tgt, token_id);

for (int s = 0; s < n_seq_dft; ++s) {
if (!drafts[s].active) {
@@ -400,12 +400,12 @@ int main(int argc, char ** argv) {
LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);

llama_kv_cache_seq_keep(ctx_dft, s_keep);
llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1);
llama_memory_seq_cp (ctx_dft, s_keep, 0, -1, -1);
llama_kv_cache_seq_keep(ctx_dft, 0);

llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
llama_memory_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
llama_kv_cache_seq_keep(ctx_tgt, s_keep);
llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
llama_memory_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
llama_kv_cache_seq_keep(ctx_tgt, 0);
}

@@ -420,10 +420,10 @@ int main(int argc, char ** argv) {
drafts[0].dists.push_back(std::vector<llama_token_data>());
drafts[0].i_batch_tgt.push_back(0);

llama_batch_clear(batch_dft);
llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
common_batch_clear(batch_dft);
common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);

llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
llama_memory_seq_rm(ctx_dft, 0, n_past_dft, -1);
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
llama_decode(ctx_dft, batch_dft);

@@ -447,8 +447,8 @@ int main(int argc, char ** argv) {
drafts[0].drafting = true;
drafts[0].i_batch_dft = 0;

llama_batch_clear(batch_tgt);
llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
common_batch_clear(batch_tgt);
common_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);

// sample n_draft tokens from the draft model using tree-based sampling
for (int i = 0; i < n_draft; ++i) {
@@ -463,13 +463,13 @@ int main(int argc, char ** argv) {
continue;
}

llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft);
common_sampler_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft);

const auto & cur_p = drafts[s].ctx_sampling->cur;

for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) {
LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
k, s, i, cur_p[k].id, cur_p[k].p, common_token_to_piece(ctx_dft, cur_p[k].id).c_str());
}

std::vector<int> sa(1, s);
@@ -479,8 +479,8 @@ int main(int argc, char ** argv) {
if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) {
LOG("splitting seq %3d into %3d\n", s, n_seq_cur);

llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
llama_memory_seq_rm(ctx_dft, n_seq_cur, -1, -1);
llama_memory_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);

// all previous tokens from this branch are now also part of the new branch
for (int t = 0; t < batch_tgt.n_tokens; ++t) {
@@ -519,7 +519,7 @@ int main(int argc, char ** argv) {

const int s = sa[is];

llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
common_sampler_accept(drafts[s].ctx_sampling, ctx_dft, id, true);

drafts[s].tokens.push_back(id);
// save cur_p.data into drafts[s].dists
@@ -528,12 +528,12 @@ int main(int argc, char ** argv) {
// add unique drafted tokens to the target batch
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);

llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);

// add the token to the batch for batched decoding with the draft model
drafts[s].i_batch_dft = batch_dft.n_tokens;

llama_batch_add(batch_dft, id, n_past_cur, { s }, true);
common_batch_add(batch_dft, id, n_past_cur, { s }, true);

if (batch_tgt.n_tokens > n_draft) {
drafts[s].drafting = false;
@@ -560,7 +560,7 @@ int main(int argc, char ** argv) {
{
llama_kv_cache_seq_keep(ctx_tgt, 0);
for (int s = 1; s < n_seq_dft; ++s) {
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
llama_memory_seq_cp(ctx_tgt, 0, s, -1, -1);
}

// LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
@@ -599,9 +599,9 @@ int main(int argc, char ** argv) {
LOG_TEE("\ntarget:\n");
llama_print_timings(ctx_tgt);

llama_sampling_free(ctx_sampling);
common_sampler_free(ctx_sampling);
for (int s = 0; s < n_seq_dft; ++s) {
llama_sampling_free(drafts[s].ctx_sampling);
common_sampler_free(drafts[s].ctx_sampling);
}

llama_batch_free(batch_dft);

@@ -108,7 +108,7 @@ int main(int argc, char ** argv) {

// warm up
if (params.warmup) {
llama_batch_add(batch, bos, 0, { 0 }, false);
common_batch_add(batch, bos, 0, { 0 }, false);

if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
@@ -117,13 +117,13 @@ int main(int argc, char ** argv) {
}
if (params.batch_warmup) {
// clean up KV cache after generation
llama_kv_cache_seq_rm(ctx, 0, params.n_ubatch, -1);
llama_memory_seq_rm(ctx, 0, params.n_ubatch, -1);

// prepare batch of pp size for prompt processing performance measurement
llama_batch_clear(batch);
common_batch_clear(batch);

for (unsigned int i = 0; i < params.n_ubatch; ++i) {
llama_batch_add(batch, std::rand() % n_vocab, i, { 0 }, false);
common_batch_add(batch, std::rand() % n_vocab, i, { 0 }, false);
}

if (!decode_helper(ctx, batch, ctx_params.n_ubatch)) {
@@ -132,19 +132,19 @@ int main(int argc, char ** argv) {
}
}

llama_batch_clear(batch);
llama_kv_cache_clear(ctx);
common_batch_clear(batch);
llama_memory_clear(ctx);

for (unsigned int n_kv = 0; n_kv < n_kv_max; n_kv += params.n_ubatch) {
// clean up KV cache before generation
llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);
llama_memory_seq_rm(ctx, 0, n_kv, -1);

// first measure token generation performance at this context size
const auto t_tg_start = ggml_time_us();

for (unsigned int i = 0; i < tg; ++i) {
llama_batch_clear(batch);
llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, true);
common_batch_clear(batch);
common_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, true);

if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
@@ -155,13 +155,13 @@ int main(int argc, char ** argv) {
const auto t_tg_end = ggml_time_us();

// clean up KV cache after generation
llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);
llama_memory_seq_rm(ctx, 0, n_kv, -1);

// prepare batch of pp size for prompt processing performance measurement
llama_batch_clear(batch);
common_batch_clear(batch);

for (unsigned int i = 0; i < pp; ++i) {
llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, false);
common_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, false);
}
batch.logits[batch.n_tokens - 1] = true;


@@ -367,7 +367,7 @@ int main(int raw_argc, char ** raw_argv) {
const bool parse_special = !no_parse_special;

std::vector<llama_token> tokens;
tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);
tokens = ::common_tokenize(model, prompt, add_bos, parse_special);

if (printing_ids) {
printf("[");
@@ -382,7 +382,7 @@ int main(int raw_argc, char ** raw_argv) {
} else {
bool invalid_utf8 = false;
printf("%6d -> '", tokens[i]);
write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
write_utf8_cstr_to_stdout(common_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
if (invalid_utf8) {
printf("' (utf-8 decode failure)\n");
} else {

@@ -614,7 +614,9 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t
}

GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
#ifndef NDEBUG
printf("%s(%s -> %s)\n", __func__, src->name, dst->name);
#endif
if (ggml_backend_buffer_is_cuda(src->buffer)) {
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;

@@ -763,7 +763,7 @@ extern "C" {
LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);

// Clear the KV cache - both cell info is erased and KV data is zeroed
LLAMA_API void llama_kv_cache_clear(
LLAMA_API void llama_memory_clear(
struct llama_context * ctx);

// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
@@ -771,7 +771,7 @@ extern "C" {
// seq_id < 0 : match any sequence
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
LLAMA_API bool llama_kv_cache_seq_rm(
LLAMA_API bool llama_memory_seq_rm(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
@@ -781,7 +781,7 @@ extern "C" {
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
LLAMA_API void llama_kv_cache_seq_cp(
LLAMA_API void llama_memory_seq_cp(
struct llama_context * ctx,
llama_seq_id seq_id_src,
llama_seq_id seq_id_dst,
@@ -799,7 +799,7 @@ extern "C" {
// - explicitly with llama_kv_cache_update()
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
LLAMA_API void llama_kv_cache_seq_add(
LLAMA_API void llama_memory_seq_add(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
@@ -812,7 +812,7 @@ extern "C" {
// - explicitly with llama_kv_cache_update()
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
LLAMA_API void llama_kv_cache_seq_div(
LLAMA_API void llama_memory_seq_div(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
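As a quick, hedged illustration of the renamed llama_memory_* calls and the position conventions documented above (a negative bound extends to the start or end of the sequence), with hypothetical sequence ids and positions:

    // ctx is assumed to be a valid llama_context *
    llama_memory_seq_rm (ctx, 1, 128, -1);      // drop positions [128, inf) from sequence 1
    llama_memory_seq_cp (ctx, 0, 2, -1, -1);    // share all of sequence 0 with sequence 2 (no extra KV memory)
    llama_memory_seq_add(ctx, 1, 0, 128, 32);   // shift positions [0, 128) of sequence 1 forward by 32
    llama_memory_seq_div(ctx, 1, 0, 128, 4);    // divide positions [0, 128) of sequence 1 by 4
    llama_memory_clear  (ctx);                  // erase the whole cache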
@@ -1124,7 +1124,7 @@ extern "C" {
int32_t lstrip,
bool special);

/// @details Convert the provided tokens into text (inverse of llama_tokenize()).
/// @details Convert the provided tokens into text (inverse of common_tokenize()).
/// @param text The char pointer must be large enough to hold the resulting text.
/// @return Returns the number of chars/bytes on success, no more than text_len_max.
/// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.

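A short, hedged round-trip sketch of the helpers referred to in the comment above; `model` and `ctx` are assumed to be initialized, and the call pattern mirrors the tokenizer test further down.

    std::vector<llama_token> toks = common_tokenize(model, "Hello world", true, false);
    printf("res: '%s'\n", common_token_to_piece(ctx, toks).c_str());   // detokenize the whole vector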
@@ -1576,7 +1576,7 @@ struct llama_vocab::impl {
std::vector<token_data> id_to_token;

std::vector<llama_token> cache_special_tokens;
std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
std::vector<std::string> cache_token_to_piece; // common_token_to_piece(special = true);
struct pair_hash {
size_t operator()(const std::pair<std::string, std::string> & p) const {
return std::hash<std::string>{}(p.first) ^ //create some hash for pair
@@ -3639,7 +3639,7 @@ int32_t llama_vocab_token_to_piece(
return vocab->token_to_piece(token, buf, length, lstrip, special);
}

//int32_t llama_detokenize(
//int32_t common_token_to_piece(
// const struct llama_vocab * vocab,
// const llama_token * tokens,
// int32_t n_tokens,

@@ -5385,15 +5385,15 @@ int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
return ctx->kv_self.used;
}

void llama_kv_cache_clear(struct llama_context * ctx) {
void llama_memory_clear(struct llama_context * ctx) {
llama_kv_cache_clear(ctx->kv_self);
}

bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
bool llama_memory_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
}

void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
void llama_memory_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
if (seq_id_src == seq_id_dst) {
return;
}
@@ -5404,7 +5404,7 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
}

void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
void llama_memory_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
if (delta == 0) {
return;
}
@@ -5412,7 +5412,7 @@ void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, lla
llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
}

void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
void llama_memory_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
if (d == 1) {
return;
}
@@ -6039,9 +6039,9 @@ struct llama_data_read {

if (!res) {
if (seq_id == -1) {
llama_kv_cache_clear(ctx);
llama_memory_clear(ctx);
} else {
llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
llama_memory_seq_rm(ctx, seq_id, -1, -1);
}
throw std::runtime_error("failed to restore kv cache");
}

@@ -802,7 +802,7 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
result.push_back(unicode_cpt_from_utf8(utf8, offset));
}
catch (const std::invalid_argument & /*ex*/) {
// Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
// Silently ignore invalid UTF-8 input to avoid leaking the exception beyond common_tokenize
++offset;
result.emplace_back(0xFFFD); // replacement character
}

@@ -199,7 +199,7 @@ int main(int argc, char **argv) {

printf("\n");
printf("src: '%s'\n", test_kv.first.c_str());
printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
printf("res: '%s'\n", common_token_to_piece(ctx, res).c_str());
printf("tok: ");
for (const auto & tok : res) {
printf("%d ", tok);
@@ -216,16 +216,16 @@ int main(int argc, char **argv) {
if (!correct) {
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
llama_detokenize(ctx, res).c_str(),
llama_detokenize(ctx, test_kv.second).c_str());
common_token_to_piece(ctx, res).c_str(),
common_token_to_piece(ctx, test_kv.second).c_str());
fprintf(stderr, "%s : expected tokens: ", __func__);
for (const auto & t : test_kv.second) {
fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
}
fprintf(stderr, "\n");
fprintf(stderr, "%s : got tokens: ", __func__);
for (const auto & t : res) {
fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
}
fprintf(stderr, "\n");

@@ -272,7 +272,7 @@ int main(int argc, char **argv) {
}

for (const auto & tok : res) {
//ofs << tok << " '" << string_strip(llama_detokenize(ctx, std::vector<int>{tok})) << "'" << std::endl;
//ofs << tok << " '" << string_strip(common_token_to_piece(ctx, std::vector<int>{tok})) << "'" << std::endl;
ofs << tok << "\n";
}
}