mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-22 22:24:11 +00:00
Merge mainline llama.cpp (#3)
* Merging mainline - WIP * Merging mainline - WIP AVX2 and CUDA appear to work. CUDA performance seems slightly (~1-2%) lower as it is so often the case with llama.cpp/ggml after some "improvements" have been made. * Merging mainline - fix Metal * Remove check --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -37,14 +37,15 @@ static gpt_params * g_params;
|
||||
static std::vector<llama_token> * g_input_tokens;
|
||||
static std::ostringstream * g_output_ss;
|
||||
static std::vector<llama_token> * g_output_tokens;
|
||||
static bool is_interacting = false;
|
||||
static bool is_interacting = false;
|
||||
static bool need_insert_eot = false;
|
||||
|
||||
static bool file_exists(const std::string &path) {
|
||||
static bool file_exists(const std::string & path) {
|
||||
std::ifstream f(path.c_str());
|
||||
return f.good();
|
||||
}
|
||||
|
||||
static bool file_is_empty(const std::string &path) {
|
||||
static bool file_is_empty(const std::string & path) {
|
||||
std::ifstream f;
|
||||
f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
|
||||
f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
|
||||
@@ -99,7 +100,8 @@ static void write_logfile(
|
||||
static void sigint_handler(int signo) {
|
||||
if (signo == SIGINT) {
|
||||
if (!is_interacting && g_params->interactive) {
|
||||
is_interacting = true;
|
||||
is_interacting = true;
|
||||
need_insert_eot = true;
|
||||
} else {
|
||||
console::cleanup();
|
||||
printf("\n");
|
||||
@@ -117,6 +119,15 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
|
||||
LOG_TEE("%s", text);
|
||||
}
|
||||
|
||||
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
|
||||
llama_chat_msg new_msg{role, content};
|
||||
auto formatted = llama_chat_format_single(
|
||||
model, g_params->chat_template, chat_msgs, new_msg, role == "user");
|
||||
chat_msgs.push_back({role, content});
|
||||
LOG("formatted: %s\n", formatted.c_str());
|
||||
return formatted;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
g_params = ¶ms;
|
||||
@@ -190,6 +201,7 @@ int main(int argc, char ** argv) {
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
llama_context * ctx_guidance = NULL;
|
||||
std::vector<llama_chat_msg> chat_msgs;
|
||||
g_model = &model;
|
||||
g_ctx = &ctx;
|
||||
|
||||
@@ -215,6 +227,15 @@ int main(int argc, char ** argv) {
|
||||
__func__, n_ctx_train, n_ctx);
|
||||
}
|
||||
|
||||
// print chat template example in conversation mode
|
||||
if (params.conversation) {
|
||||
if (params.enable_chat_template) {
|
||||
LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
|
||||
} else {
|
||||
LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
LOG_TEE("\n");
|
||||
@@ -244,26 +265,38 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||
if (!llama_model_has_encoder(model)) {
|
||||
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||
}
|
||||
LOG("add_bos: %d\n", add_bos);
|
||||
|
||||
std::vector<llama_token> embd_inp;
|
||||
|
||||
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
||||
LOG("tokenize the prompt\n");
|
||||
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||
} else {
|
||||
LOG("use session tokens\n");
|
||||
embd_inp = session_tokens;
|
||||
}
|
||||
{
|
||||
auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty())
|
||||
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
|
||||
: params.prompt;
|
||||
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
||||
LOG("tokenize the prompt\n");
|
||||
embd_inp = ::llama_tokenize(ctx, prompt, true, true);
|
||||
} else {
|
||||
LOG("use session tokens\n");
|
||||
embd_inp = session_tokens;
|
||||
}
|
||||
|
||||
LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
|
||||
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
LOG("prompt: \"%s\"\n", log_tostr(prompt));
|
||||
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
}
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
if (add_bos) {
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
} else {
|
||||
LOG_TEE("error: input is empty\n");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Tokenize negative prompt
|
||||
@@ -478,6 +511,7 @@ int main(int argc, char ** argv) {
|
||||
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
||||
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
||||
std::ostringstream output_ss; g_output_ss = &output_ss;
|
||||
std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode
|
||||
|
||||
// the first thing we will do is to output the prompt, so set color accordingly
|
||||
console::set_display(console::prompt);
|
||||
@@ -500,6 +534,24 @@ int main(int argc, char ** argv) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (llama_model_has_encoder(model)) {
|
||||
int enc_input_size = embd_inp.size();
|
||||
llama_token * enc_input_buf = embd_inp.data();
|
||||
|
||||
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
||||
if (decoder_start_token_id == -1) {
|
||||
decoder_start_token_id = llama_token_bos(model);
|
||||
}
|
||||
|
||||
embd_inp.clear();
|
||||
embd_inp.push_back(decoder_start_token_id);
|
||||
}
|
||||
|
||||
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
|
||||
// predict
|
||||
if (!embd.empty()) {
|
||||
@@ -793,11 +845,20 @@ int main(int argc, char ** argv) {
|
||||
is_antiprompt = true;
|
||||
}
|
||||
|
||||
if (params.enable_chat_template) {
|
||||
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
|
||||
}
|
||||
is_interacting = true;
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
// if current token is not EOG, we add it to current assistant message
|
||||
if (params.conversation) {
|
||||
auto id = llama_sampling_last(ctx_sampling);
|
||||
assistant_ss << llama_token_to_piece(ctx, id, false);
|
||||
}
|
||||
|
||||
if (n_past > 0 && is_interacting) {
|
||||
LOG("waiting for user input\n");
|
||||
|
||||
@@ -848,12 +909,24 @@ int main(int argc, char ** argv) {
|
||||
string_process_escapes(buffer);
|
||||
}
|
||||
|
||||
bool format_chat = params.conversation && params.enable_chat_template;
|
||||
std::string user_inp = format_chat
|
||||
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
|
||||
: std::move(buffer);
|
||||
// TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
|
||||
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
|
||||
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
|
||||
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
|
||||
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
||||
|
||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
|
||||
|
||||
// if user stop generation mid-way, we must add EOT to finish model's last response
|
||||
if (need_insert_eot && format_chat) {
|
||||
llama_token eot = llama_token_eot(model);
|
||||
embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot);
|
||||
need_insert_eot = false;
|
||||
}
|
||||
|
||||
embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
|
||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||
embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
|
||||
@@ -864,6 +937,9 @@ int main(int argc, char ** argv) {
|
||||
output_ss << llama_token_to_piece(ctx, token);
|
||||
}
|
||||
|
||||
// reset assistant message
|
||||
assistant_ss.str("");
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
LOG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
|
||||
Reference in New Issue
Block a user