mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-12 23:10:01 +00:00
When tokenizer info is missing in the model, use llama3 by default
This commit is contained in:
13
llama.cpp
13
llama.cpp
@@ -4882,14 +4882,21 @@ static void llm_load_vocab(
|
||||
// for now, only BPE models have pre-tokenizers
|
||||
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
|
||||
if (tokenizer_pre.empty()) {
|
||||
LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
|
||||
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
// OK - I don't feel like recreating the LLaMA-v3 models. Considering that, at least for now,
|
||||
// LLaMA-v3 is the only model where we end up here, let's just force the pre-tokenizer to be
|
||||
// llama3.
|
||||
tokenizer_pre = "llama3";
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
|
||||
LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'llama3'\n", __func__);
|
||||
LLAMA_LOG_WARN("%s: \n", __func__);
|
||||
LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
|
||||
LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
|
||||
LLAMA_LOG_WARN("%s: GENERATION QUALITY MAY BE DEGRADED! \n", __func__);
|
||||
LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
|
||||
LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
|
||||
LLAMA_LOG_WARN("%s: \n", __func__);
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
//vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
} else if (tokenizer_pre == "default") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (
|
||||
|
||||
Reference in New Issue
Block a user