mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-01-26 17:20:01 +00:00
Server: refactor and rename functions (#1151)
* Server: rename functions and refactor code
  rename functions
  refactor update slots
  rename params_base
  rename timings
* change
* Revert kv cache name changes
* Revert 2
* fix test build error

---------

Co-authored-by: firecoperana <firecoperana>
This commit is contained in:
@@ -14,11 +14,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
||||
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
||||
|
||||
for (uint64_t i = 0; i < sentences.size(); i++) {
|
||||
llama_batch_clear(batch);
|
||||
common_batch_clear(batch);
|
||||
|
||||
const std::string input_string = instruction + sentences[i];
|
||||
|
||||
std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
|
||||
std::vector<llama_token> inputs = common_tokenize(mdl, input_string, true, false);
|
||||
|
||||
const int32_t n_toks = inputs.size();
|
||||
|
||||
@@ -27,7 +27,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
||||
// inputs.push_back(llama_token_eos(mdl));
|
||||
|
||||
// we want to ignore instruction tokens for mean pooling
|
||||
const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size();
|
||||
const int32_t n_inst = common_tokenize(mdl, instruction, true, false).size();
|
||||
|
||||
#ifdef GRIT_DEBUG
|
||||
// debug tokens - should be matching as referenced in the GritLM sample
|
||||
@@ -39,7 +39,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
||||
|
||||
// add input to batch (this increments n_tokens)
|
||||
for (int32_t j = 0; j < n_toks; j++) {
|
||||
llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
|
||||
common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
|
||||
}
|
||||
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
@@ -104,14 +104,14 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
|
||||
|
||||
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
||||
|
||||
std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
|
||||
std::vector<llama_token> inputs = common_tokenize(mdl, prompt, false, true);
|
||||
int32_t i_current_token = 0;
|
||||
|
||||
while (true) {
|
||||
llama_batch_clear(bat);
|
||||
common_batch_clear(bat);
|
||||
auto n_inputs = (int32_t)inputs.size();
|
||||
for (int32_t i = 0; i < n_inputs; i++) {
|
||||
llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
|
||||
common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
|
||||
}
|
||||
inputs.clear();
|
||||
|
||||
@@ -130,7 +130,7 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
|
||||
break;
|
||||
}
|
||||
|
||||
std::string piece = llama_token_to_piece(ctx, token);
|
||||
std::string piece = common_token_to_piece(ctx, token);
|
||||
if (stream) {
|
||||
std::printf("%s", piece.c_str());
|
||||
std::fflush(stdout);
|
||||
|
||||
Reference in New Issue
Block a user