mirror of
https://github.com/turboderp-org/exllamav2.git
synced 2026-04-20 06:19:00 +00:00
Merge remote-tracking branch 'origin/dev' into dev
This commit is contained in:
@@ -124,7 +124,10 @@ class ExLlamaV2Embedding(ExLlamaV2Module):
|
||||
standard_mask_ = standard_mask[i]
|
||||
input_ids_ = input_ids[i]
|
||||
standard_ids_ = input_ids_[standard_mask_]
|
||||
standard_embeddings_ = self.embedding(standard_ids_)
|
||||
if loras is not None and loras[0].embed_tokens is not None:
|
||||
standard_embeddings_ = loras[0].embed_tokens(standard_ids_)
|
||||
else:
|
||||
standard_embeddings_ = self.embedding(standard_ids_)
|
||||
standard_embeddings_ = safe_move_tensor(standard_embeddings_, indexed_embeddings.device)
|
||||
combined_embeddings[i][standard_mask_] = standard_embeddings_
|
||||
|
||||
@@ -144,7 +147,10 @@ class ExLlamaV2Embedding(ExLlamaV2Module):
|
||||
# Call embedding module if no indexed embeddings
|
||||
|
||||
else:
|
||||
hidden_states = self.embedding.forward(hidden_states)
|
||||
if loras is not None and loras[0].embed_tokens is not None:
|
||||
hidden_states = loras[0].embed_tokens(hidden_states)
|
||||
else:
|
||||
hidden_states = self.embedding(hidden_states)
|
||||
|
||||
if self.model.config.arch.normalize_embeddings:
|
||||
hidden_states *= self.model.config.hidden_size ** 0.5
|
||||
|
||||
@@ -241,6 +241,14 @@ class ExLlamaV2Linear(ExLlamaV2Module):
|
||||
|
||||
# Linear forward
|
||||
|
||||
if self.key == 'lm_head' and loras is not None and loras[0].lm_head is not None:
|
||||
hidden_states_out = loras[0].lm_head(hidden_states)
|
||||
|
||||
if intermediates:
|
||||
return {"hidden_states": hidden_states_out}
|
||||
else:
|
||||
return hidden_states_out
|
||||
|
||||
if self.q_handle is not None and not force_recons:
|
||||
|
||||
output_shape = hidden_states.shape[:-1] + (self.out_features,)
|
||||
|
||||
@@ -53,6 +53,8 @@ class ExLlamaV2Lora:
|
||||
self.target_modules = {}
|
||||
self.bias_ignored = False
|
||||
self.lora_scaling = lora_scaling
|
||||
self.embed_tokens = None
|
||||
self.lm_head = None
|
||||
|
||||
# Grab relevant items from LoRA config
|
||||
|
||||
@@ -77,6 +79,29 @@ class ExLlamaV2Lora:
|
||||
tensor = f[key]
|
||||
|
||||
# Find target
|
||||
if key.endswith(f'{self.config.arch.lm_head_key}.weight'):
|
||||
if tensor.dtype == torch.bfloat16:
|
||||
tensor = tensor.to(torch.float16)
|
||||
elif tensor.dtype == torch.float32:
|
||||
tensor = tensor.to(torch.float16)
|
||||
target_module = self.model.modules_dict["lm_head"]
|
||||
tensor = safe_move_tensor(tensor, target_module.device())
|
||||
self.lm_head = torch.nn.Linear(target_module.in_features, tensor.shape[0], bias = False, device = "meta")
|
||||
self.lm_head.weight = torch.nn.Parameter(tensor, requires_grad=False)
|
||||
continue
|
||||
elif key.endswith(f'embed_tokens.weight'):
|
||||
if tensor.dtype == torch.bfloat16:
|
||||
tensor = tensor.to(torch.float16)
|
||||
elif tensor.dtype == torch.float32:
|
||||
tensor = tensor.to(torch.float16)
|
||||
target_module = self.model.modules_dict["model.embed_tokens"]
|
||||
tensor = safe_move_tensor(tensor, target_module.device())
|
||||
self.embed_tokens = torch.nn.Embedding(tensor.shape[0], self.config.hidden_size, self.config.pad_token_id, device = "meta")
|
||||
weight = torch.nn.Parameter(tensor, requires_grad=False)
|
||||
if self.model.config.scale_emb != 1:
|
||||
weight *= self.model.config.scale_emb
|
||||
self.embed_tokens.weight = weight
|
||||
continue
|
||||
|
||||
i = key.find("model.layers.")
|
||||
if i == -1: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}")
|
||||
|
||||
Reference in New Issue
Block a user