mirror of
https://github.com/turboderp-org/exllamav2.git
synced 2026-04-20 06:19:00 +00:00
Merge branch 'refs/heads/dev'
This commit is contained in:
@@ -402,6 +402,8 @@ class ExLlamaV2ArchParams:
|
||||
self.mmp.mlp_bias = True
|
||||
self.mmp.norm = "layernorm"
|
||||
|
||||
self.standard_calib_noise = (5, 30)
|
||||
|
||||
# Gemma
|
||||
|
||||
if arch_string == "GemmaForCausalLM":
|
||||
|
||||
@@ -186,16 +186,40 @@ class ExLlamaV2Embedding(ExLlamaV2Module):
|
||||
if self.archparams.normalize_embeddings:
|
||||
hidden_states *= cfg.hidden_size ** 0.5
|
||||
|
||||
# Negative tokens during quantization are noise tokens
|
||||
# Rows with negative tokens during quantization are noise tokens
|
||||
|
||||
if kwargs.get("negative_ids_noise"):
|
||||
mask = (input_ids < 0).unsqueeze(-1)
|
||||
unmasked_values = hidden_states[~mask.expand_as(hidden_states)].float()
|
||||
mean, std = unmasked_values.mean(), unmasked_values.std()
|
||||
noise = torch.randn_like(hidden_states, dtype = torch.float)
|
||||
noise = noise * std + mean
|
||||
noise = noise.half()
|
||||
hidden_states = torch.where(mask, noise, hidden_states)
|
||||
|
||||
n = 0
|
||||
mean = torch.tensor([0.0], dtype = torch.float, device = hidden_states.device)
|
||||
M2 = torch.tensor([0.0], dtype = torch.float, device = hidden_states.device)
|
||||
|
||||
for i in range(input_ids.shape[0]):
|
||||
if input_ids[i][0] < 0:
|
||||
continue
|
||||
|
||||
er = hidden_states[i].float()
|
||||
n += er.numel()
|
||||
delta = er - mean
|
||||
mean += delta.sum() / n
|
||||
delta2 = er - mean
|
||||
M2 += (delta * delta2).sum()
|
||||
del er
|
||||
del delta
|
||||
del delta2
|
||||
|
||||
if n > 1:
|
||||
std = torch.sqrt(M2 / (n - 1))
|
||||
|
||||
for i in range(input_ids.shape[0]):
|
||||
if input_ids[i][0] >= 0:
|
||||
continue
|
||||
|
||||
er = hidden_states[i]
|
||||
noise = torch.randn(er.size(), dtype = torch.float, device = hidden_states.device) * std + mean
|
||||
er.copy_(noise.half())
|
||||
del er
|
||||
del noise
|
||||
|
||||
# Move to pinned temp buffer for TP
|
||||
|
||||
|
||||
@@ -58,50 +58,50 @@ void rope_
|
||||
);
|
||||
}
|
||||
|
||||
long gen_mrope_pos_ids
|
||||
int64_t gen_mrope_pos_ids
|
||||
(
|
||||
torch::Tensor mrope_pos_ids,
|
||||
torch::Tensor ids,
|
||||
int merge_size,
|
||||
const std::vector<std::tuple<long, long>> &spans,
|
||||
const std::vector<std::tuple<long, long, long>> &grids
|
||||
const std::vector<std::tuple<int64_t, int64_t>> &spans,
|
||||
const std::vector<std::tuple<int64_t, int64_t, int64_t>> &grids
|
||||
)
|
||||
{
|
||||
int max_length = mrope_pos_ids.size(1);
|
||||
int in_length = ids.size(0);
|
||||
|
||||
long* in_ids = (long*) ids.data_ptr();
|
||||
long* pos_ids = (long*) mrope_pos_ids.data_ptr();
|
||||
int64_t* in_ids = (int64_t*) ids.data_ptr();
|
||||
int64_t* pos_ids = (int64_t*) mrope_pos_ids.data_ptr();
|
||||
|
||||
long* out_t = pos_ids;
|
||||
long* out_h = pos_ids + max_length;
|
||||
long* out_w = pos_ids + 2 * max_length;
|
||||
int64_t* out_t = pos_ids;
|
||||
int64_t* out_h = pos_ids + max_length;
|
||||
int64_t* out_w = pos_ids + 2 * max_length;
|
||||
|
||||
long base_t = 0;
|
||||
long next_base_t = 0;
|
||||
int64_t base_t = 0;
|
||||
int64_t next_base_t = 0;
|
||||
|
||||
for (int i = 0; i < max_length; ++i)
|
||||
{
|
||||
bool is_emb = false;
|
||||
if (i < in_length)
|
||||
{
|
||||
long id = in_ids[i];
|
||||
int64_t id = in_ids[i];
|
||||
|
||||
for (int j = 0; j < spans.size(); ++j)
|
||||
{
|
||||
long span_start = std::get<0>(spans[j]);
|
||||
long span_end = std::get<1>(spans[j]);
|
||||
long span = span_end - span_start;
|
||||
int64_t span_start = std::get<0>(spans[j]);
|
||||
int64_t span_end = std::get<1>(spans[j]);
|
||||
int64_t span = span_end - span_start;
|
||||
if (id >= span_start && id < span_end)
|
||||
{
|
||||
is_emb = true;
|
||||
long k = id - span_start;
|
||||
long grid_t = std::get<0>(grids[j]);
|
||||
long grid_h = std::get<1>(grids[j]) / (long)merge_size;
|
||||
long grid_w = std::get<2>(grids[j]) / (long)merge_size;
|
||||
long k_t = base_t + (k / grid_w / grid_h) % grid_t;
|
||||
long k_h = base_t + (k / grid_w) % grid_h;
|
||||
long k_w = base_t + k % grid_w;
|
||||
int64_t k = id - span_start;
|
||||
int64_t grid_t = std::get<0>(grids[j]);
|
||||
int64_t grid_h = std::get<1>(grids[j]) / (int64_t)merge_size;
|
||||
int64_t grid_w = std::get<2>(grids[j]) / (int64_t)merge_size;
|
||||
int64_t k_t = base_t + (k / grid_w / grid_h) % grid_t;
|
||||
int64_t k_h = base_t + (k / grid_w) % grid_h;
|
||||
int64_t k_w = base_t + k % grid_w;
|
||||
*out_t++ = k_t;
|
||||
*out_h++ = k_h;
|
||||
*out_w++ = k_w;
|
||||
|
||||
@@ -11,11 +11,11 @@ void rope_
|
||||
bool neox_style
|
||||
);
|
||||
|
||||
long gen_mrope_pos_ids
|
||||
int64_t gen_mrope_pos_ids
|
||||
(
|
||||
torch::Tensor mrope_pos_ids,
|
||||
torch::Tensor ids,
|
||||
int merge_size,
|
||||
const std::vector<std::tuple<long, long>> &spans,
|
||||
const std::vector<std::tuple<long, long, long>> &grids
|
||||
const std::vector<std::tuple<int64_t, int64_t>> &spans,
|
||||
const std::vector<std::tuple<int64_t, int64_t, int64_t>> &grids
|
||||
);
|
||||
@@ -2589,8 +2589,9 @@ class ExLlamaV2DynamicJob:
|
||||
self.generator.all_pages[0].backup()
|
||||
|
||||
for seq in self.sequences:
|
||||
for page in seq.allocated_pages:
|
||||
page.sub_ref()
|
||||
seq.allocated_pages = []
|
||||
if seq.allocated_pages is not None:
|
||||
for page in seq.allocated_pages:
|
||||
page.sub_ref()
|
||||
seq.allocated_pages = []
|
||||
|
||||
self.generator.validate_cache()
|
||||
|
||||
@@ -36,7 +36,7 @@ def gen_mrope_embed(
|
||||
|
||||
# Create 3D position IDs
|
||||
|
||||
ids = input_ids.squeeze(0)
|
||||
ids = input_ids.squeeze(0).contiguous()
|
||||
mrope_pos_ids = torch.zeros((3, max_length), dtype = torch.long).contiguous()
|
||||
merge_size = 1 if not embeddings else embeddings[0].model.config.vision_spatial_merge_size
|
||||
spans = []
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = "0.2.5"
|
||||
__version__ = "0.2.6"
|
||||
Reference in New Issue
Block a user