Fix some tokenization edge cases

This commit is contained in:
turboderp
2023-12-03 22:03:23 +01:00
parent 99f6ac3037
commit 38d393718d
3 changed files with 21 additions and 10 deletions

View File

@@ -201,6 +201,9 @@ class ExLlamaV2StreamingGenerator(ExLlamaV2BaseGenerator):
id_to_ord = self.tokenizer.get_id_to_ord_list()
b = [id_to_ord[x] for x in self.held_utf8_tokens[0].tolist()]
c = bytes(b).decode('utf-8')
except ValueError:
id_to_piece = self.tokenizer.get_id_to_piece_list()
c = "".join(id_to_piece[x] for x in self.held_utf8_tokens[0].tolist())
except UnicodeDecodeError:
c = "�"

View File

@@ -32,8 +32,8 @@ class ExLlamaV2Tokenizer:
bos_token_id: int
eos_token_id: int
pad_token_id: int
newline_token_id: int
space_token_id: int
newline_token_id: int or None
space_token_id: int or None
id_to_ord: list = None
id_to_piece: list = None
@@ -94,8 +94,11 @@ class ExLlamaV2Tokenizer:
ok_tokens = []
for p, i in self.unspecial_piece_to_id.items():
itp = self.tokenizer.decode([i])
if itp == p: ok_tokens.append(p)
try:
itp = self.tokenizer.decode([i])
if itp == p: ok_tokens.append(p)
except IndexError:
pass
for t in ok_tokens: del self.unspecial_piece_to_id[t]
# Invert extended dictionaries
@@ -139,8 +142,10 @@ class ExLlamaV2Tokenizer:
# Useful token IDs
self.newline_token_id = self.tokenizer.encode(self.newline_token)[-1]
self.space_token_id = self.tokenizer.encode(self.space_token)[-1]
try: self.newline_token_id = self.tokenizer.encode(self.newline_token)[-1]
except: self.newline_token_id = None
try: self.space_token_id = self.tokenizer.encode(self.space_token)[-1]
except: self.space_token_id = None
# Create dictionaries on init

View File

@@ -4,15 +4,18 @@ from exllamav2 import ExLlamaV2Tokenizer
import random
# model_path = "/mnt/str/models/_exl2/deepseek-coder-1.3b"
model_path = "/mnt/str/models/_exl2/deepseek-67b-chat/"
# model_path = "/mnt/str/models/_exl2/deepseek-67b-chat/"
# model_path = "/mnt/str/models/_exl2/deepseek-coder-33b-instruct-exl2/6.0bpw/"
model_path = "/mnt/str/models/_exl2/qwen-72b-llamafied/"
reference_tokenizer = AutoTokenizer.from_pretrained(model_path)
# reference_tokenizer.save_pretrained("/mnt/str/temp/tok/tokenizer.json")
config = ExLlamaV2Config()
config.model_dir = model_path
config.prepare()
config.prepare(no_tensors = True)
exl2_tokenizer = ExLlamaV2Tokenizer(config)
reference_tokenizer = AutoTokenizer.from_pretrained(model_path)
# Bunch of text
xtext = """