Add Falcon-Edge support (#555)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-06-26 08:48:52 +02:00
committed by GitHub
parent b8402290ef
commit c8e6d9cfe7
3 changed files with 34 additions and 2 deletions

View File

@@ -393,6 +393,20 @@ struct llm_tokenizer_bpe {
"[0-9][0-9][0-9]",
};
break;
case LLAMA_VOCAB_PRE_TYPE_FALCON_3:
regex_exprs = {
"[\\p{P}\\$\\+<=>\\^~\\|`]+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"[0-9]",
};
break;
case LLAMA_VOCAB_PRE_TYPE_FALCON_E:
regex_exprs = {
"[\\p{P}\\$\\+<=>\\^~\\|`]+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"[0-9]",
};
break;
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
case LLAMA_VOCAB_PRE_TYPE_REFACT:
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: