Special handling of Seed Coder FIM tokens (#585)

* Special handling of Seed Coder FIM tokens

* vocab: Add Seed Coder pretokenizer

* Formatting fix

* Update llama.h
This commit is contained in:
Fizz~
2025-07-06 06:13:55 -04:00
committed by GitHub
parent 49d4d2630a
commit 27ff5bf57e
5 changed files with 27 additions and 0 deletions

View File

@@ -6302,6 +6302,10 @@ static void llm_load_vocab(
tokenizer_pre == "bailingmoe") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "seed-coder") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
vocab.tokenizer_clean_spaces = false;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}