mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-05-01 03:41:53 +00:00
Adding the XTC sampler (#486)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -649,6 +649,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||||||
sparams.mirostat_tau = std::stof(argv[i]);
|
sparams.mirostat_tau = std::stof(argv[i]);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
if (arg == "--xtc-probability") {
|
||||||
|
CHECK_ARG
|
||||||
|
sparams.xtc_probability = std::stof(argv[i]);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (arg == "--xtc-threshold") {
|
||||||
|
CHECK_ARG
|
||||||
|
sparams.xtc_threshold = std::stof(argv[i]);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
if (arg == "--cfg-negative-prompt") {
|
if (arg == "--cfg-negative-prompt") {
|
||||||
CHECK_ARG
|
CHECK_ARG
|
||||||
sparams.cfg_negative_prompt = argv[i];
|
sparams.cfg_negative_prompt = argv[i];
|
||||||
@@ -1635,6 +1645,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
|||||||
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
|
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
|
||||||
options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
|
options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
|
||||||
options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
|
options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
|
||||||
|
options.push_back({ "*", " --xtc-probability p", "xtc probability (default: %.1f, 0.0 = disabled)", (double)sparams.xtc_probability });
|
||||||
|
options.push_back({ "*", " --xtc-threshold t", "xtc threshold (default: %.1f, 0.0 = disabled)", (double)sparams.xtc_threshold});
|
||||||
options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
|
options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
|
||||||
"i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
|
"i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
|
||||||
"or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
|
"or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
|
||||||
@@ -3396,6 +3408,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|||||||
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
||||||
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
|
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
|
||||||
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
|
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
|
||||||
|
fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
|
||||||
|
fprintf(stream, "xtc_threshold: %f # default: 0.0\n", sparams.xtc_threshold);
|
||||||
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
||||||
fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
|
fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
|
||||||
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
||||||
|
|||||||
@@ -121,10 +121,12 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
|
|||||||
snprintf(result, sizeof(result),
|
snprintf(result, sizeof(result),
|
||||||
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
||||||
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
|
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
|
||||||
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f\n"
|
||||||
|
"\txtc_probability = %.3f, xtc_threshold = %.3f",
|
||||||
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
|
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
|
||||||
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
|
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
|
||||||
params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
params.mirostat, params.mirostat_eta, params.mirostat_tau,
|
||||||
|
params.xtc_probability, params.xtc_threshold);
|
||||||
|
|
||||||
return std::string(result);
|
return std::string(result);
|
||||||
}
|
}
|
||||||
@@ -153,6 +155,7 @@ std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
|
|||||||
case llama_sampler_type::TOP_P: return "top_p";
|
case llama_sampler_type::TOP_P: return "top_p";
|
||||||
case llama_sampler_type::MIN_P: return "min_p";
|
case llama_sampler_type::MIN_P: return "min_p";
|
||||||
case llama_sampler_type::TEMPERATURE: return "temperature";
|
case llama_sampler_type::TEMPERATURE: return "temperature";
|
||||||
|
case llama_sampler_type::XTC : return "xtc";
|
||||||
default : return "";
|
default : return "";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -164,6 +167,7 @@ std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vecto
|
|||||||
{"typical_p", llama_sampler_type::TYPICAL_P},
|
{"typical_p", llama_sampler_type::TYPICAL_P},
|
||||||
{"min_p", llama_sampler_type::MIN_P},
|
{"min_p", llama_sampler_type::MIN_P},
|
||||||
{"tfs_z", llama_sampler_type::TFS_Z},
|
{"tfs_z", llama_sampler_type::TFS_Z},
|
||||||
|
{"xtc", llama_sampler_type::XTC},
|
||||||
{"temperature", llama_sampler_type::TEMPERATURE}
|
{"temperature", llama_sampler_type::TEMPERATURE}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -178,6 +182,7 @@ std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vecto
|
|||||||
{"min-p", llama_sampler_type::MIN_P},
|
{"min-p", llama_sampler_type::MIN_P},
|
||||||
{"tfs-z", llama_sampler_type::TFS_Z},
|
{"tfs-z", llama_sampler_type::TFS_Z},
|
||||||
{"tfs", llama_sampler_type::TFS_Z},
|
{"tfs", llama_sampler_type::TFS_Z},
|
||||||
|
{"xtc", llama_sampler_type::XTC},
|
||||||
{"temp", llama_sampler_type::TEMPERATURE}
|
{"temp", llama_sampler_type::TEMPERATURE}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -212,6 +217,7 @@ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::strin
|
|||||||
{'y', llama_sampler_type::TYPICAL_P},
|
{'y', llama_sampler_type::TYPICAL_P},
|
||||||
{'m', llama_sampler_type::MIN_P},
|
{'m', llama_sampler_type::MIN_P},
|
||||||
{'f', llama_sampler_type::TFS_Z},
|
{'f', llama_sampler_type::TFS_Z},
|
||||||
|
{'x', llama_sampler_type::XTC},
|
||||||
{'t', llama_sampler_type::TEMPERATURE}
|
{'t', llama_sampler_type::TEMPERATURE}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -240,6 +246,8 @@ static void sampler_queue(
|
|||||||
const float min_p = params.min_p;
|
const float min_p = params.min_p;
|
||||||
const float tfs_z = params.tfs_z;
|
const float tfs_z = params.tfs_z;
|
||||||
const float typical_p = params.typical_p;
|
const float typical_p = params.typical_p;
|
||||||
|
const float xtc_probability = params.xtc_probability;
|
||||||
|
const float xtc_threshold = params.xtc_threshold;
|
||||||
const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
|
const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
|
||||||
|
|
||||||
for (auto sampler_type : samplers_sequence) {
|
for (auto sampler_type : samplers_sequence) {
|
||||||
@@ -249,6 +257,7 @@ static void sampler_queue(
|
|||||||
case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
||||||
case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
||||||
case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
||||||
|
case llama_sampler_type::XTC : llama_sample_xtc (ctx_main, &cur_p, xtc_probability, xtc_threshold, min_keep); break;
|
||||||
case llama_sampler_type::TEMPERATURE:
|
case llama_sampler_type::TEMPERATURE:
|
||||||
if (dynatemp_range > 0) {
|
if (dynatemp_range > 0) {
|
||||||
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ enum class llama_sampler_type : char {
|
|||||||
TOP_P = 'p',
|
TOP_P = 'p',
|
||||||
MIN_P = 'm',
|
MIN_P = 'm',
|
||||||
TFS_Z = 'f',
|
TFS_Z = 'f',
|
||||||
|
XTC = 'x',
|
||||||
TYPICAL_P = 'y',
|
TYPICAL_P = 'y',
|
||||||
TEMPERATURE = 't'
|
TEMPERATURE = 't'
|
||||||
};
|
};
|
||||||
@@ -39,6 +40,8 @@ typedef struct llama_sampling_params {
|
|||||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||||
float mirostat_tau = 5.00f; // target entropy
|
float mirostat_tau = 5.00f; // target entropy
|
||||||
float mirostat_eta = 0.10f; // learning rate
|
float mirostat_eta = 0.10f; // learning rate
|
||||||
|
float xtc_probability = 0.0f; // xtc probability
|
||||||
|
float xtc_threshold = 1.0f; // xtc threshold, disabled if > 0.5
|
||||||
bool penalize_nl = false; // consider newlines as a repeatable token
|
bool penalize_nl = false; // consider newlines as a repeatable token
|
||||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
|
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
|
||||||
|
|
||||||
|
|||||||
@@ -1208,6 +1208,14 @@ extern "C" {
|
|||||||
llama_token_data_array * candidates,
|
llama_token_data_array * candidates,
|
||||||
float temp);
|
float temp);
|
||||||
|
|
||||||
|
/// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
/// @param candidates_p The candidate tokens to filter; the array is modified in place.
/// @param probability Chance that the filter is applied on a given call. Negative values make the call a no-op; values of 1 or more apply it on every call.
/// @param threshold Probability cutoff: candidates with p >= threshold are eligible for removal. Values above 0.5 disable the sampler.
/// @param min_keep The filter is skipped if it would leave fewer than min_keep candidates.
|
||||||
|
LLAMA_API void llama_sample_xtc(
|
||||||
|
struct llama_context * ctx,
|
||||||
|
llama_token_data_array * candidates_p,
|
||||||
|
float probability,
|
||||||
|
float threshold,
|
||||||
|
size_t min_keep);
|
||||||
|
|
||||||
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
||||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
||||||
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
||||||
|
|||||||
@@ -434,6 +434,40 @@ void llama_sample_temp_impl(struct llama_sampling * smpl, llama_token_data_array
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void llama_sample_xtc_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float probability, float threshold, size_t min_keep) {
|
||||||
|
if (probability < 0 || threshold > 0.5f || candidates->size < 2) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
GGML_ASSERT(smpl);
|
||||||
|
const int64_t t_start_sample_us = ggml_time_us();
|
||||||
|
if (probability < 1) {
|
||||||
|
std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
|
||||||
|
float chance = distribution(smpl->rng);
|
||||||
|
if (chance > probability) return;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_sample_softmax_impl(nullptr, candidates);
|
||||||
|
|
||||||
|
auto cur_size = candidates->size;
|
||||||
|
|
||||||
|
int pos_last = 0;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < candidates->size; ++i) {
|
||||||
|
if (candidates->data[i].p >= threshold) {
|
||||||
|
pos_last = i;
|
||||||
|
} else break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (candidates->size - pos_last >= min_keep && pos_last > 0) {
|
||||||
|
candidates->data += pos_last;
|
||||||
|
candidates->size -= pos_last;
|
||||||
|
}
|
||||||
|
|
||||||
|
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||||
|
smpl->n_sample++;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
void llama_sample_repetition_penalties_impl(
|
void llama_sample_repetition_penalties_impl(
|
||||||
struct llama_sampling * smpl,
|
struct llama_sampling * smpl,
|
||||||
llama_token_data_array * candidates,
|
llama_token_data_array * candidates,
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_
|
|||||||
void llama_sample_typical_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
|
void llama_sample_typical_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
|
||||||
void llama_sample_entropy_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
|
void llama_sample_entropy_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
|
||||||
void llama_sample_temp_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float temp);
|
void llama_sample_temp_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float temp);
|
||||||
|
void llama_sample_xtc_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float probability, float threshold, size_t min_keep);
|
||||||
|
|
||||||
void llama_sample_repetition_penalties_impl(
|
void llama_sample_repetition_penalties_impl(
|
||||||
struct llama_sampling * smpl,
|
struct llama_sampling * smpl,
|
||||||
|
|||||||
@@ -23265,6 +23265,11 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
|
|||||||
llama_sample_temp_impl(ctx ? &ctx->sampling : nullptr, candidates_p, temp);
|
llama_sample_temp_impl(ctx ? &ctx->sampling : nullptr, candidates_p, temp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Public C API entry point for the XTC sampler. Resolves the sampling state
// embedded in `ctx` (or nullptr when no context is given) and forwards all
// arguments unchanged to llama_sample_xtc_impl.
void llama_sample_xtc(
        struct llama_context * ctx,
        llama_token_data_array * candidates_p,
        float probability,
        float threshold,
        size_t min_keep) {
    struct llama_sampling * smpl = nullptr;
    if (ctx) {
        smpl = &ctx->sampling;
    }
    llama_sample_xtc_impl(smpl, candidates_p, probability, threshold, min_keep);
}
|
||||||
|
|
||||||
void llama_sample_repetition_penalties(
|
void llama_sample_repetition_penalties(
|
||||||
struct llama_context * ctx,
|
struct llama_context * ctx,
|
||||||
llama_token_data_array * candidates,
|
llama_token_data_array * candidates,
|
||||||
|
|||||||
Reference in New Issue
Block a user