mirror of
https://github.com/turboderp-org/exllamav2.git
synced 2026-04-20 14:29:28 +00:00
Fix some tokenization edge cases
This commit is contained in:
@@ -201,6 +201,9 @@ class ExLlamaV2StreamingGenerator(ExLlamaV2BaseGenerator):
|
||||
id_to_ord = self.tokenizer.get_id_to_ord_list()
|
||||
b = [id_to_ord[x] for x in self.held_utf8_tokens[0].tolist()]
|
||||
c = bytes(b).decode('utf-8')
|
||||
except ValueError:
|
||||
id_to_piece = self.tokenizer.get_id_to_piece_list()
|
||||
c = "".join(id_to_piece[x] for x in self.held_utf8_tokens[0].tolist())
|
||||
except UnicodeDecodeError:
|
||||
c = "<EFBFBD>"
|
||||
|
||||
|
||||
@@ -32,8 +32,8 @@ class ExLlamaV2Tokenizer:
|
||||
bos_token_id: int
|
||||
eos_token_id: int
|
||||
pad_token_id: int
|
||||
newline_token_id: int
|
||||
space_token_id: int
|
||||
newline_token_id: int or None
|
||||
space_token_id: int or None
|
||||
|
||||
id_to_ord: list = None
|
||||
id_to_piece: list = None
|
||||
@@ -94,8 +94,11 @@ class ExLlamaV2Tokenizer:
|
||||
|
||||
ok_tokens = []
|
||||
for p, i in self.unspecial_piece_to_id.items():
|
||||
itp = self.tokenizer.decode([i])
|
||||
if itp == p: ok_tokens.append(p)
|
||||
try:
|
||||
itp = self.tokenizer.decode([i])
|
||||
if itp == p: ok_tokens.append(p)
|
||||
except IndexError:
|
||||
pass
|
||||
for t in ok_tokens: del self.unspecial_piece_to_id[t]
|
||||
|
||||
# Invert extended dictionaries
|
||||
@@ -139,8 +142,10 @@ class ExLlamaV2Tokenizer:
|
||||
|
||||
# Useful token IDs
|
||||
|
||||
self.newline_token_id = self.tokenizer.encode(self.newline_token)[-1]
|
||||
self.space_token_id = self.tokenizer.encode(self.space_token)[-1]
|
||||
try: self.newline_token_id = self.tokenizer.encode(self.newline_token)[-1]
|
||||
except: self.newline_token_id = None
|
||||
try: self.space_token_id = self.tokenizer.encode(self.space_token)[-1]
|
||||
except: self.space_token_id = None
|
||||
|
||||
# Create dictionaries on init
|
||||
|
||||
|
||||
@@ -4,15 +4,18 @@ from exllamav2 import ExLlamaV2Tokenizer
|
||||
import random
|
||||
|
||||
# model_path = "/mnt/str/models/_exl2/deepseek-coder-1.3b"
|
||||
model_path = "/mnt/str/models/_exl2/deepseek-67b-chat/"
|
||||
# model_path = "/mnt/str/models/_exl2/deepseek-67b-chat/"
|
||||
# model_path = "/mnt/str/models/_exl2/deepseek-coder-33b-instruct-exl2/6.0bpw/"
|
||||
model_path = "/mnt/str/models/_exl2/qwen-72b-llamafied/"
|
||||
|
||||
reference_tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
# reference_tokenizer.save_pretrained("/mnt/str/temp/tok/tokenizer.json")
|
||||
|
||||
config = ExLlamaV2Config()
|
||||
config.model_dir = model_path
|
||||
config.prepare()
|
||||
config.prepare(no_tensors = True)
|
||||
exl2_tokenizer = ExLlamaV2Tokenizer(config)
|
||||
|
||||
reference_tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
|
||||
# Bunch of text
|
||||
|
||||
xtext = """
|
||||
|
||||
Reference in New Issue
Block a user