Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-02-20)
* spec : add self speculative decoding and ngram-mod and refactor

  common : use common_ prefix for common library function
  llama : use LLAMA_TOKEN_NULL
  spec : add self speculative decoding (no draft model required) + refactor
  spec : add ngram-mod
  spec : various improvements to ngram-map + docs
  spec : fix the check-rate logic of ngram-simple
  common : add common_speculative_is_compat()
  spec : simplify time measurement using common_time_meas
  refactor common_sampler_init
  refactor common_token_to_piece
  refactor and fix cur_p bug
  clean up

* spec : remove check rate
* spec : show warnings instead of abort

---------

Co-authored-by: firecoperana <firecoperana>
Co-authored-by: Sascha Rogmann <59577610+srogmann@users.noreply.github.com>
42 lines · 1.5 KiB · C++
#pragma once

#include "llama.h"
#include "common.h"

struct common_speculative;

// comma-separated list of all types
std::string common_speculative_type_name_str();

// convert string to type
enum common_speculative_type common_speculative_type_from_name(const std::string & name);

// convert type to string
std::string common_speculative_type_to_str(enum common_speculative_type type);

// check if the llama_context is compatible with speculative decoding
// note: clears the memory of the context
bool common_speculative_is_compat(llama_context * ctx_tgt);

common_speculative * common_speculative_init(
        common_params_speculative & params,
        llama_context * ctx_tgt);

void common_speculative_free(common_speculative * spec);

// optionally call once at the beginning of a new generation
void common_speculative_begin(common_speculative * spec, const llama_tokens & prompt);

// sample up to n_draft tokens and add them to the batch using the draft model
llama_tokens common_speculative_draft(
        common_speculative * spec,
        const common_params_speculative & params,
        const llama_tokens & prompt,
        llama_token id_last);

// inform the speculative decoder that n_accepted tokens were accepted by the target model
void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);

// print statistics about the speculative decoding
void common_speculative_print_stats(const common_speculative * spec);
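Taken together, these declarations describe one draft/verify loop: check compatibility, initialize, optionally prime the drafter with the prompt, then alternate common_speculative_draft() with common_speculative_accept() so the drafter learns how many of its tokens the target model confirms. Below is a minimal sketch of that call order, not a drop-in implementation. It assumes this header is included as "speculative.h", and verify_with_target() is a hypothetical stand-in for the target-model verification step, which this API leaves to the caller:

#include "speculative.h" // assumed include path for the header above

// Hypothetical helper, not part of this API: run the target model over
// `draft` and report how many of the drafted tokens it confirmed.
// The actual llama_decode()/sampling plumbing is elided in this sketch.
static uint16_t verify_with_target(llama_context * /*ctx_tgt*/, const llama_tokens & /*draft*/) {
    return 0; // placeholder
}

static void generate_with_speculation(llama_context * ctx_tgt,
                                      common_params_speculative & params,
                                      llama_tokens prompt) {
    // the compatibility check clears the context memory, so run it before
    // any prompt processing has happened in ctx_tgt
    if (!common_speculative_is_compat(ctx_tgt)) {
        return; // fall back to plain decoding
    }

    common_speculative * spec = common_speculative_init(params, ctx_tgt);

    // optional: prime the drafter once with the initial prompt
    common_speculative_begin(spec, prompt);

    llama_token id_last = prompt.back(); // prompt assumed non-empty

    bool done = false;
    while (!done) {
        // draft up to the configured number of tokens following id_last
        const llama_tokens draft = common_speculative_draft(spec, params, prompt, id_last);

        // verify the draft with the target model (elided), then tell the
        // drafter how many tokens were accepted so it can adjust
        const uint16_t n_accepted = verify_with_target(ctx_tgt, draft);
        common_speculative_accept(spec, n_accepted);

        // ... append the accepted tokens to `prompt`, update `id_last`,
        //     and clear `done` only while generation should continue ...
        done = true; // placeholder termination for this sketch
    }

    common_speculative_print_stats(spec);
    common_speculative_free(spec);
}

Judging by the commit message above, the accept feedback is what lets the draft-model-free variants (ngram-simple, ngram-map, ngram-mod) track how well their speculations are paying off, and common_speculative_print_stats() reports those numbers at the end of a run.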