Temporary workaround for tokenizers with undefined padding token

turboderp
2023-11-30 09:01:57 +01:00
parent 343cec4fea
commit 020fa4fcae
2 changed files with 6 additions and 5 deletions


@@ -118,7 +118,7 @@ class ExLlamaV2Tokenizer:
 # Some tokenizers use token ID zero for text but don't explicitly define a padding token but provide one anyway
 pad_test = self.tokenizer.piece_to_id("<pad>")
-self.pad_token_id = pad_test or 0
+self.pad_token_id = pad_test or self.eos_token_id
 # Special case if <unk> and <pad> have the same ID
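The change above only swaps the fallback: when piece_to_id("<pad>") returns 0 (i.e. the vocabulary defines no distinct <pad> piece), the tokenizer now reuses the EOS token ID as its padding ID instead of hard-coding 0. Below is a minimal standalone sketch of that resolution logic; the resolve_pad_token_id helper and the toy vocabulary are hypothetical illustrations, not the exllamav2 API.

# Hypothetical standalone sketch of the padding-token fallback (not exllamav2 itself).
def resolve_pad_token_id(piece_to_id, eos_token_id):
    # piece_to_id: callable mapping a piece string to its token ID, e.g.
    # sentencepiece's SentencePieceProcessor.piece_to_id, which returns 0
    # for pieces that are not in the vocabulary.
    pad_test = piece_to_id("<pad>")
    # 0 is falsy, so an undefined (or ID-0) <pad> piece falls back to EOS.
    return pad_test or eos_token_id

# Toy vocabulary with no dedicated <pad> piece:
vocab = {"<unk>": 0, "<s>": 1, "</s>": 2}
pad_id = resolve_pad_token_id(lambda piece: vocab.get(piece, 0), eos_token_id=2)
print(pad_id)  # -> 2: EOS doubles as the padding token

Reusing EOS for padding is a common convention when a model ships without a pad token (Hugging Face users often set tokenizer.pad_token = tokenizer.eos_token for the same reason), which is presumably why the commit calls this a temporary workaround rather than a proper fix.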


@@ -3,7 +3,8 @@ from exllamav2 import ExLlamaV2Config
 from exllamav2 import ExLlamaV2Tokenizer
 import random
-model_path = "/mnt/str/models/_exl2/deepseek-coder-1.3b"
+# model_path = "/mnt/str/models/_exl2/deepseek-coder-1.3b"
+model_path = "/mnt/str/models/_exl2/deepseek-67b-chat/"
 config = ExLlamaV2Config()
 config.model_dir = model_path
@@ -14,8 +15,7 @@ reference_tokenizer = AutoTokenizer.from_pretrained(model_path)
 # Bunch of text
-text = """
+xtext = """
 Following added tokens are encoded correctly but decoded incorrectly by the HF tokenizer:
 ö
@@ -33,8 +33,9 @@ Following added tokens are encoded correctly but decoded incorrectly by the HF t
 û
 (workaround seems to be working for now)
 """
+text = """
+This command might take a bit of time if your corpus is very large, but for this dataset of 1.6 GB of texts it's blazing fast (1 minute 16 seconds on an AMD Ryzen 9
+3900X CPU with 12 cores). Note that AutoTokenizer.train_new_from_iterator() only works if the tokenizer you are using is a “fast” tokenizer. As you'll see in the next section,
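For context on what the modified test script exercises, here is a hedged sketch of the kind of encode/decode comparison it sets up between ExLlamaV2Tokenizer and the Hugging Face reference tokenizer. The sample string, the add_special_tokens=False choice and the printed checks are assumptions for illustration; only the config and tokenizer setup mirrors the lines shown in the diff.

# Sketch only: assumes a local EXL2 model directory; prints rather than asserts.
from exllamav2 import ExLlamaV2Config, ExLlamaV2Tokenizer
from transformers import AutoTokenizer

model_path = "/mnt/str/models/_exl2/deepseek-67b-chat/"  # assumed local path, as in the diff

config = ExLlamaV2Config()
config.model_dir = model_path
config.prepare()

tokenizer = ExLlamaV2Tokenizer(config)
reference_tokenizer = AutoTokenizer.from_pretrained(model_path)

# The padding workaround from the first file: with no explicit <pad> piece,
# this should now report the EOS ID instead of 0.
print("pad_token_id:", tokenizer.pad_token_id, "eos_token_id:", tokenizer.eos_token_id)

sample = "Temporary workaround for tokenizers with undefined padding token."

# ExLlamaV2Tokenizer.encode returns a (1, seq_len) tensor of token IDs.
ids = tokenizer.encode(sample)[0].tolist()
ref_ids = reference_tokenizer.encode(sample, add_special_tokens=False)

print("exllamav2:", ids)
print("reference:", ref_ids)
print("match:", ids == ref_ids)

Since the HF tokenizer decodes some added tokens incorrectly (as the xtext block notes), the real test presumably tolerates or enumerates known mismatches rather than requiring exact equality on every input.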