mirror of https://github.com/ikawrakow/ik_llama.cpp.git
spec : add self speculative decoding, ngram and refactor (#1261)
* spec : add self speculative decoding and ngram-mod and refactor

  common : use common_ prefix for common library function
  llama : use LLAMA_TOKEN_NULL
  spec : add self speculative decoding (no draft model required) + refactor
  spec : add ngram-mod
  spec : various improvements to ngram-map + docs
  spec : fix the check-rate logic of ngram-simple
  common : add common_speculative_is_compat()
  spec : simplify time measurement using common_time_meas
  refactor common_sampler_init
  refactor common_token_to_piece
  refactor and fix cur_p bug
  clean up

* spec : remove check rate

* spec: show warnings instead of abort

---------

Co-authored-by: firecoperana <firecoperana>
Co-authored-by: Sascha Rogmann <59577610+srogmann@users.noreply.github.com>
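For context, the sketch below outlines the general idea that lets speculative decoding work without a separate draft model: match the most recent n tokens against an earlier occurrence in the prompt plus generated text, propose the tokens that followed that occurrence as a draft, then verify the draft with the target model in a single batch and keep only the prefix it agrees with. This is an illustrative outline of the technique, not the ngram-mod/ngram-map code added by this commit; the helper name find_ngram_draft and its signature are hypothetical.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    using llama_token = int32_t;

    // Hypothetical helper (not this commit's code): draft up to n_draft tokens by matching
    // the last n tokens of the sequence against an earlier occurrence and reusing the
    // tokens that followed it.
    static std::vector<llama_token> find_ngram_draft(const std::vector<llama_token> & tokens,
                                                     size_t n, size_t n_draft) {
        if (tokens.size() < n + 1) {
            return {};
        }
        const size_t tail = tokens.size() - n;  // start of the n-gram we try to match
        for (size_t i = tail; i-- > 0; ) {      // scan backwards: prefer the most recent match
            bool match = true;
            for (size_t j = 0; j < n; ++j) {
                if (tokens[i + j] != tokens[tail + j]) {
                    match = false;
                    break;
                }
            }
            if (match) {
                const size_t beg = i + n;  // continuation that followed the earlier occurrence
                const size_t end = std::min(beg + n_draft, tokens.size());
                return std::vector<llama_token>(tokens.begin() + beg, tokens.begin() + end);
            }
        }
        return {};  // no match: fall back to plain, non-speculative decoding for this step
    }

In a real loop the returned draft would be appended to the batch after the last accepted token, decoded once, and truncated at the first position where the target model's sampled token disagrees with the draft.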
@@ -352,7 +352,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::common_tokenize(ctx, params.prompt, true);
 
     const int n_ctx = llama_n_ctx(ctx);
@@ -505,7 +505,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::common_tokenize(ctx, params.prompt, true);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -770,6 +770,8 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
 }
 
 static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
     // Calculates hellaswag score (acc_norm) from prompt
     //
     // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
@@ -803,7 +805,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     size_t hs_task_count = prompt_lines.size()/6;
     fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
 
-    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
+    const bool is_spm = llama_vocab_type(vocab) == LLAMA_VOCAB_TYPE_SPM;
     fprintf(stderr, "================================= is_spm = %d\n", is_spm);
 
     // The tasks should be randomized so the score stabilizes quickly.
@@ -850,7 +852,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
         for (size_t j = 0; j < 4; j++) {
             hs_cur.ending[j] = prompt_lines[idx*6+2+j];
-            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
+            hs_cur.seq_tokens[j] = ::common_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
         }
 
         // determine the common prefix of the endings
@@ -1143,8 +1145,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
     fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
 
     for (auto & task : data) {
-        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
-        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);
+        task.seq_tokens[0] = ::common_tokenize(ctx, task.first + task.choices[0] + task.second, true);
+        task.seq_tokens[1] = ::common_tokenize(ctx, task.first + task.choices[1] + task.second, true);
 
         task.common_prefix = 0;
         for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
@@ -1159,8 +1161,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             task.seq_tokens[0].size() - task.common_prefix +
             task.seq_tokens[1].size() - task.common_prefix;
 
-        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
-        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
+        task.n_base1 = ::common_tokenize(ctx, task.first + task.choices[0], true).size();
+        task.n_base2 = ::common_tokenize(ctx, task.first + task.choices[1], true).size();
     }
 
     fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
@@ -1366,7 +1368,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
             }
             return false;
         }
-        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
+        task.seq_tokens.emplace_back(::common_tokenize(ctx, task.question + " " + answer, true));
     }
     auto min_len = task.seq_tokens.front().size();
     for (auto& seq : task.seq_tokens) {
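A note on the change repeated throughout this diff: every call site switches from the ::llama_tokenize wrapper to the common_-prefixed helper, per the "common : use common_ prefix for common library function" item in the commit message. As the hunks above show, only the name changes; callers keep passing the context, the text, and the add-special flag:

    // call shape taken from the hunks above; only the wrapper's name changed
    std::vector<llama_token> tokens = ::common_tokenize(ctx, params.prompt, true);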