diff --git a/conversion/adaptivegptq.py b/conversion/adaptivegptq.py index 9206482..4d9f631 100644 --- a/conversion/adaptivegptq.py +++ b/conversion/adaptivegptq.py @@ -465,6 +465,8 @@ class AdaptiveGPTQ: output[key + ".q_invperm"] = self.invperm.to(torch.int) output[key + ".q_scale_max"] = self.qscale_max output[key + ".q_groups"] = self.qgroups + if self.layer.bias is not None: + output[key + ".bias"] = self.layer.bias.data columns = self.columns rem_rows = self.rows diff --git a/conversion/measure.py b/conversion/measure.py index b7dd7be..758f7d7 100644 --- a/conversion/measure.py +++ b/conversion/measure.py @@ -91,8 +91,9 @@ def test_quant(source: ExLlamaV2Linear, variants = [] variants_bits = [] - original = nn.Linear(source.in_features, source.out_features, False, device = "meta", dtype = torch.float16) + original = nn.Linear(source.in_features, source.out_features, source.has_bias, device = "meta", dtype = torch.float16) original.weight = nn.Parameter(source.linear.weight.clone()) + if source.has_bias: original.bias = nn.Parameter(source.linear.bias.clone()) for qp in qparams: @@ -102,10 +103,12 @@ def test_quant(source: ExLlamaV2Linear, quantized.to("cpu") variants.append(quantized) - total_bits = qp.total_bits(quantized.weight.T.shape) + total_bits = qp.total_bits(quantized.weight.T.shape, original.bias.shape if source.has_bias else None) variants_bits.append(total_bits) - bpw = total_bits / quantized.weight.numel() + numel = quantized.weight.numel() + if source.has_bias: numel += original.bias.numel() + bpw = total_bits / numel desc = qp.desc print(f" -- {source.key:50} {desc:50} {bpw:2.2f} bpw") diff --git a/conversion/qparams.py b/conversion/qparams.py index c78a466..d25131b 100644 --- a/conversion/qparams.py +++ b/conversion/qparams.py @@ -57,7 +57,7 @@ class QParams: qp_dict["scale_bits"]) - def total_bits(self, shape): + def total_bits(self, shape, bias_shape = None): rows = shape[0] columns = shape[1] @@ -91,16 +91,26 @@ class QParams:
total_bits += groups * columns * self.scale_bits # q_scale total_bits += rows * 32 # q_invperm + if bias_shape is not None: + bias_numel = 1 + for d in bias_shape: bias_numel *= d + total_bits += 16 * bias_numel + return total_bits - def bpw(self, shape): + def bpw(self, shape, bias_shape = None): rows = shape[0] columns = shape[1] numel = rows * columns - return self.total_bits(shape) / numel + if bias_shape is not None: + bias_numel = 1 + for d in bias_shape: bias_numel *= d + numel += bias_numel + + return self.total_bits(shape, bias_shape) / numel def get_desc(self, filename = False): diff --git a/conversion/quantize.py b/conversion/quantize.py index 01348e1..e1cac47 100644 --- a/conversion/quantize.py +++ b/conversion/quantize.py @@ -55,7 +55,7 @@ def quant_linear(job: dict, # Quantize lq.configure(qp.group_size, qp.bits, qp.bits_prop, qp.scale_bits) - lq.quantize(keep_qweight = True, apply = True, drop = drop) + lq.quantize(keep_qweight = True, apply = True) # Pack and save quantized layer @@ -69,10 +69,12 @@ def quant_linear(job: dict, # Reconstruct from packed layer - recons_linear = ExLlamaV2Linear(source.model, source.key, source.in_features, source.out_features, False) + recons_linear = ExLlamaV2Linear(source.model, source.key, source.in_features, source.out_features, source.has_bias) recons_linear.device_idx = source.device_idx recons_dict = {} - for k in ["q_weight", "q_invperm", "q_scale", "q_scale_max", "q_groups"]: + recons_keys = ["q_weight", "q_invperm", "q_scale", "q_scale_max", "q_groups"] + if source.has_bias: recons_keys += ["bias"] + for k in recons_keys: recons_dict[k] = packed_dict[source.key + "."
+ k] recons_dict["q_perm"] = torch.argsort(recons_dict["q_invperm"]).to(torch.int) recons_linear.load(recons_dict) @@ -86,6 +88,7 @@ def quant_linear(job: dict, recons_w2 = recons_linear.forward(ident, force_cuda = True) recons_w2.sub_(quant_w) + if recons_linear.has_bias: recons_w2.sub_(recons_dict["bias"]) recons_w2.abs_() diff2 = torch.max(recons_w2) diff --git a/exllamav2/attn.py b/exllamav2/attn.py index f3d872f..fe7da4a 100644 --- a/exllamav2/attn.py +++ b/exllamav2/attn.py @@ -163,10 +163,10 @@ class ExLlamaV2Attention(ExLlamaV2Module): else: self.input_layernorm = ExLlamaV2RMSNorm(model, key + ".input_layernorm") - self.q_proj = ExLlamaV2Linear(model, key + ".self_attn.q_proj", hidden_size, self.model.config.num_attention_heads * self.model.config.head_dim, False) - self.k_proj = ExLlamaV2Linear(model, key + ".self_attn.k_proj", hidden_size, self.model.config.num_key_value_heads * self.model.config.head_dim, False) - self.v_proj = ExLlamaV2Linear(model, key + ".self_attn.v_proj", hidden_size, self.model.config.num_key_value_heads * self.model.config.head_dim, False) - self.o_proj = ExLlamaV2Linear(model, key + ".self_attn.o_proj", self.model.config.num_attention_heads * self.model.config.head_dim, hidden_size, False) + self.q_proj = ExLlamaV2Linear(model, key + ".self_attn.q_proj", hidden_size, self.model.config.num_attention_heads * self.model.config.head_dim, self.model.config.attention_bias_qkv) + self.k_proj = ExLlamaV2Linear(model, key + ".self_attn.k_proj", hidden_size, self.model.config.num_key_value_heads * self.model.config.head_dim, self.model.config.attention_bias_qkv) + self.v_proj = ExLlamaV2Linear(model, key + ".self_attn.v_proj", hidden_size, self.model.config.num_key_value_heads * self.model.config.head_dim, self.model.config.attention_bias_qkv) + self.o_proj = ExLlamaV2Linear(model, key + ".self_attn.o_proj", self.model.config.num_attention_heads * self.model.config.head_dim, hidden_size, self.model.config.attention_bias_o) self.submodules 
= [self.input_layernorm, self.q_proj, diff --git a/exllamav2/config.py b/exllamav2/config.py index 72eac1b..a13a522 100644 --- a/exllamav2/config.py +++ b/exllamav2/config.py @@ -44,6 +44,8 @@ class ExLlamaV2Config: head_dim: int = 128 # Constant for all Llama models, except 3b num_experts: int = None num_experts_per_token: int = None + attention_bias_qkv: bool = False + attention_bias_o: bool = False checkpoint_fused_mlp: bool = False @@ -144,6 +146,16 @@ class ExLlamaV2Config: expect_keys += \ expect_keys_llama + if "Qwen2ForCausalLM" in read_config["architectures"]: + self.architecture = "Qwen2" + layer_keys += \ + layer_keys_llama_norms + \ + layer_keys_llama_attn + \ + layer_keys_llama_mlp + expect_keys += \ + expect_keys_llama + self.attention_bias_qkv = True + self.attention_bias_o = False else: print(f" !! Warning, unknown architecture: {repr(read_config['architectures'])}") @@ -167,6 +179,9 @@ class ExLlamaV2Config: self.num_hidden_layers = read_config["num_hidden_layers"] self.rms_norm_eps = read_config["rms_norm_eps"] self.vocab_size = read_config["vocab_size"] + if read_config.get("attention_bias", False): + self.attention_bias_qkv = True + self.attention_bias_o = True self.rotary_embedding_base = read_config["rope_theta"] if "rope_theta" in read_config else 10000.0 diff --git a/exllamav2/exllamav2_ext/cuda/h_add.cu b/exllamav2/exllamav2_ext/cuda/h_add.cu new file mode 100644 index 0000000..004593a --- /dev/null +++ b/exllamav2/exllamav2_ext/cuda/h_add.cu @@ -0,0 +1,51 @@ +#include "h_add.cuh" +#include "util.cuh" +#include "../config.h" +#include "matrix_view.cuh" + +#define NUM_THREADS_X 32 +#define NUM_THREADS_Y 16 + +__global__ void cuda_vector_add_kernel +( + half* __restrict__ dest, + const half* __restrict__ source, + const int height, + const int width +) +{ + MatrixView_half_rw dest_(dest, height, width); + MatrixView_half source_(source, 1, width); + + int offset_x = blockIdx.x * NUM_THREADS_X * 2 + threadIdx.x * 2; + if (offset_x >= width) 
return; + + int offset_y = blockIdx.y * NUM_THREADS_Y; + int end_y = min(offset_y + NUM_THREADS_Y, height); + + half2 v = source_.item_half2(0, offset_x); + for (int y = offset_y; y < end_y; ++y) + { + half2* ptr = (half2*) dest_.item_ptr(y, offset_x); + *ptr = __hadd2(v, *ptr); + } +} + +void cuda_vector_add_ +( + half* dest, + const half* source, + int height, + int width +) +{ + dim3 blockDim, gridDim; + blockDim.x = NUM_THREADS_X; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = DIVIDE(width, NUM_THREADS_X * 2); + gridDim.y = DIVIDE(height, NUM_THREADS_Y); + gridDim.z = 1; + + cuda_vector_add_kernel<<<gridDim, blockDim>>>(dest, source, height, width); +} diff --git a/exllamav2/exllamav2_ext/cuda/h_add.cuh b/exllamav2/exllamav2_ext/cuda/h_add.cuh new file mode 100644 index 0000000..643c2b4 --- /dev/null +++ b/exllamav2/exllamav2_ext/cuda/h_add.cuh @@ -0,0 +1,18 @@ +#ifndef _h_add_cuh +#define _h_add_cuh + +#include <cuda_runtime.h> +#include <cuda_fp16.h> +#include <cstdint> +#include <cstdio> +#include <ATen/cuda/CUDAContext.h> + +void cuda_vector_add_ +( + half* dest, + const half* source, + int height, + int width +); + +#endif \ No newline at end of file diff --git a/exllamav2/exllamav2_ext/cuda/q_gemm.cu b/exllamav2/exllamav2_ext/cuda/q_gemm.cu index b0d28b2..33306fc 100644 --- a/exllamav2/exllamav2_ext/cuda/q_gemm.cu +++ b/exllamav2/exllamav2_ext/cuda/q_gemm.cu @@ -5,6 +5,7 @@ #define CLEAR_N_SIZE 256 #include "comp_units/kernel_select.cuh" +#include "h_add.cuh" void gemm_half_q_half_cuda_part ( @@ -165,6 +166,8 @@ void gemm_half_q_half_cuda int block_m = min(size_m, block_m_size_max); gemm_half_q_half_cuda_part(a, b, c, size_m, size_n, size_k, block_m, clear, r_weights, r_weights_stride, mul_r_weights); } + + if (b->cuda_bias) cuda_vector_add_(c, b->cuda_bias, size_m, size_n); } __global__ void clear_kernel diff --git a/exllamav2/exllamav2_ext/cuda/q_matrix.cu b/exllamav2/exllamav2_ext/cuda/q_matrix.cu index 7635954..9e7e64e 100644 --- a/exllamav2/exllamav2_ext/cuda/q_matrix.cu +++ b/exllamav2/exllamav2_ext/cuda/q_matrix.cu @@ -63,6 +63,8 @@
QMatrix::QMatrix half* _gptq_scales, uint32_t* _gptq_g_idx, + half* _bias, + half* _temp_dq ) : device(_device), @@ -84,6 +86,7 @@ QMatrix::QMatrix cuda_q_group_map = _q_group_map; cuda_gptq_qzeros = _gptq_qzeros; cuda_gptq_scales = _gptq_scales; + cuda_bias = _bias; is_gptq = (_gptq_qzeros != NULL); diff --git a/exllamav2/exllamav2_ext/cuda/q_matrix.cuh b/exllamav2/exllamav2_ext/cuda/q_matrix.cuh index 3097d9f..8838896 100644 --- a/exllamav2/exllamav2_ext/cuda/q_matrix.cuh +++ b/exllamav2/exllamav2_ext/cuda/q_matrix.cuh @@ -37,6 +37,7 @@ public: uint16_t* cuda_q_group_map = NULL; uint32_t* cuda_gptq_qzeros = NULL; half* cuda_gptq_scales = NULL; + half* cuda_bias = NULL; half* temp_dq; @@ -61,6 +62,8 @@ public: half* _gptq_scales, uint32_t* _gptq_g_idx, + half* bias, + half* _temp_dq ); diff --git a/exllamav2/exllamav2_ext/ext_qmatrix.cpp b/exllamav2/exllamav2_ext/ext_qmatrix.cpp index b4663bc..2970224 100644 --- a/exllamav2/exllamav2_ext/ext_qmatrix.cpp +++ b/exllamav2/exllamav2_ext/ext_qmatrix.cpp @@ -29,6 +29,7 @@ uintptr_t make_q_matrix torch::Tensor gptq_qzeros, torch::Tensor gptq_scales, torch::Tensor gptq_g_idx, + torch::Tensor bias, torch::Tensor temp_dq ) { @@ -42,6 +43,7 @@ uintptr_t make_q_matrix TORCH_CHECK_DTYPE_OPT(gptq_qzeros, kInt); TORCH_CHECK_DTYPE_OPT(gptq_scales, kHalf); TORCH_CHECK_DTYPE_OPT(gptq_g_idx, kInt); + TORCH_CHECK_DTYPE_OPT(bias, kHalf); TORCH_CHECK_SHAPES(q_perm, 0, q_invperm, 0, 1); @@ -65,6 +67,11 @@ uintptr_t make_q_matrix height = q_weight.size(0) * 8; } + if (!bias.device().is_meta()) + { + TORCH_CHECK_SHAPES(q_weight, 1, bias, 0, 1); + } + TORCH_CHECK(temp_dq.size(0) >= width * height, "Insufficient size of temp_dq buffer") QMatrix* m = new QMatrix @@ -83,6 +90,7 @@ uintptr_t make_q_matrix gptq_qzeros.device().is_meta() ? NULL : (uint32_t*) gptq_qzeros.data_ptr(), gptq_scales.device().is_meta() ? NULL : (half*) gptq_scales.data_ptr(), gptq_g_idx.device().is_meta() ? 
NULL : (uint32_t*) gptq_g_idx.data_ptr(), + bias.device().is_meta() ? NULL : (half*) bias.data_ptr(), (half*) temp_dq.data_ptr() ); diff --git a/exllamav2/exllamav2_ext/ext_qmatrix.h b/exllamav2/exllamav2_ext/ext_qmatrix.h index 1c74d95..255131f 100644 --- a/exllamav2/exllamav2_ext/ext_qmatrix.h +++ b/exllamav2/exllamav2_ext/ext_qmatrix.h @@ -11,6 +11,7 @@ uintptr_t make_q_matrix torch::Tensor gptq_qzeros, torch::Tensor gptq_scales, torch::Tensor gptq_g_idx, + torch::Tensor bias, torch::Tensor temp_dq ); diff --git a/exllamav2/ext.py b/exllamav2/ext.py index 10d724c..3a532a5 100644 --- a/exllamav2/ext.py +++ b/exllamav2/ext.py @@ -117,6 +117,7 @@ if build_jit: "ext_rope.cpp", "ext_safetensors.cpp", "ext_sampling.cpp", + "cuda/h_add.cu", "cuda/h_gemm.cu", "cuda/lora.cu", "cuda/pack_tensor.cu", @@ -216,6 +217,7 @@ def make_q_matrix(w: dict, temp_dq, key: str = None): none_tensor, none_tensor, none_tensor, + w.get("bias", none_tensor), temp_dq) # GPTQ @@ -241,6 +243,7 @@ def make_q_matrix(w: dict, temp_dq, key: str = None): w["qzeros"], w["scales"], w["g_idx"].cpu(), + w.get("bias", none_tensor), temp_dq) # GPTQ without g_idx @@ -257,6 +260,7 @@ def make_q_matrix(w: dict, temp_dq, key: str = None): w["qzeros"], w["scales"], none_tensor, + w.get("bias", none_tensor), temp_dq) diff --git a/exllamav2/generator/streaming.py b/exllamav2/generator/streaming.py index 937ee8a..2e2d29b 100644 --- a/exllamav2/generator/streaming.py +++ b/exllamav2/generator/streaming.py @@ -199,6 +199,7 @@ class ExLlamaV2StreamingGenerator(ExLlamaV2BaseGenerator): return tuple(ret) + # @profile def _stream(self) -> (str, bool, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor): # Token healing @@ -240,9 +241,9 @@ class ExLlamaV2StreamingGenerator(ExLlamaV2BaseGenerator): self.settings.begin_filters() self.first_token = False - # Decode the current tail end of the sequence - - old_tail = self.tokenizer.decode(self.sequence_ids[:1, -self.tail_decode_tokens:])[0] + # # Decode the current tail 
end of the sequence + # + # old_tail = self.tokenizer.decode(self.sequence_ids[:1, -self.tail_decode_tokens:])[0] # Generate a single token and append to the sequence @@ -253,10 +254,13 @@ class ExLlamaV2StreamingGenerator(ExLlamaV2BaseGenerator): if next_token.item() in self.stop_tokens: return self.held_text, True, self.no_tokens, self.no_probs, self.no_ptokens, self.no_pprobs, self.no_logits - # Decode the tail end of the sequence with the added token to get (actual) characters added + # # Decode the tail end of the sequence with the added token to get (actual) characters added + # + # new_tail = self.tokenizer.decode(self.sequence_ids[:1, -(self.tail_decode_tokens + 1):])[0] + # new_text = new_tail[len(old_tail):] - new_tail = self.tokenizer.decode(self.sequence_ids[:1, -(self.tail_decode_tokens + 1):])[0] - new_text = new_tail[len(old_tail):] + piece_to_id = self.tokenizer.get_id_to_piece_list() + new_text = piece_to_id[self.sequence_ids[0, -1].item()] next_token, new_text = self._catch_utf8(next_token, new_text) diff --git a/exllamav2/linear.py b/exllamav2/linear.py index e6242e2..c6ab7b3 100644 --- a/exllamav2/linear.py +++ b/exllamav2/linear.py @@ -44,6 +44,10 @@ class ExLlamaV2Linear(ExLlamaV2Module): if w is None: w = self.load_weight() if isinstance(w, dict): + if self.has_bias: + assert "bias" in w, self.key + " has no bias but bias expected" + else: + assert "bias" not in w, self.key + " has bias but bias is not expected" device_tensors = self.model.get_device_tensors(self.device_idx) device_tensors.begin_scratch_alloc() self.temp_dq = device_tensors.get_scratch_slice(self.temp_dq_size()) @@ -51,10 +55,22 @@ class ExLlamaV2Linear(ExLlamaV2Module): self.q_handle = ext.make_q_matrix(w, self.temp_dq) elif isinstance(w, nn.Parameter): + assert not self.has_bias, self.key + " has no bias tensor but bias is expected" if self.padding > 0: w = nn.Parameter(F.pad(w.data, (0, 0, 0, self.padding)).contiguous()) self.linear = nn.Linear(self.in_features, 
self.out_features, self.has_bias, device = "meta", dtype = torch.float16) self.linear.weight = w + elif isinstance(w, tuple): + assert self.has_bias, self.key + " has bias tensor but bias is not expected" + ww = w[0] + wb = w[1] + if self.padding > 0: + ww = nn.Parameter(F.pad(ww.data, (0, 0, 0, self.padding)).contiguous()) + wb = nn.Parameter(F.pad(wb.data, (0, self.padding)).contiguous()) + self.linear = nn.Linear(self.in_features, self.out_features, self.has_bias, device = "meta", dtype = torch.float16) + self.linear.weight = ww + self.linear.bias = wb + def matrix_shape(self): @@ -129,6 +145,10 @@ class ExLlamaV2Linear(ExLlamaV2Module): matrix = self.get_weight_tensor_dq() hidden_states_out = torch.matmul(hidden_states, matrix) + if self.has_bias: + bias = self.get_bias_tensor() + hidden_states_out += bias + # Evaluate LoRAs if loras is not None: @@ -198,6 +218,17 @@ class ExLlamaV2Linear(ExLlamaV2Module): raise ValueError(f"Layer {self.key} has no data") + def get_bias_tensor(self): + + if self.linear is not None: + return self.linear.bias.data + + elif self.q_handle is not None: + return self.q_tensors["bias"] + + else: + raise ValueError(f"Layer {self.key} has no data") + def is_quant(self): return self.q_handle is not None diff --git a/exllamav2/model.py b/exllamav2/model.py index d13b4a8..5c07a79 100644 --- a/exllamav2/model.py +++ b/exllamav2/model.py @@ -9,8 +9,6 @@ if sys.version_info < min_version: # Set CUDA context to lazy loading since we won't need 95% of the modules in Torch os.environ["CUDA_MODULE_LOADING"] = "LAZY" -# Disabled for 0.0.13.post2 -# # # Set cudaMallocAsync allocator by default as it appears slightly more memory efficient, unless Torch is already # # imported in which case changing the allocator would cause it to crash # if not "PYTORCH_CUDA_ALLOC_CONF" in os.environ: diff --git a/exllamav2/module.py b/exllamav2/module.py index b84cf23..64a5780 100644 --- a/exllamav2/module.py +++ b/exllamav2/module.py @@ -93,14 +93,14 @@
class ExLlamaV2Module: # EXL2 if key + ".q_weight" in self.model.config.tensor_file_map: - qtensors = self.load_multi(["q_weight", "q_invperm", "q_scale", "q_scale_max", "q_groups", "q_perm"], override_key = override_key) + qtensors = self.load_multi(["q_weight", "q_invperm", "q_scale", "q_scale_max", "q_groups", "q_perm", "bias"], override_key = override_key) qtensors["q_perm"] = torch.argsort(qtensors["q_invperm"]).to(torch.int) return qtensors # GPTQ if key + ".qweight" in self.model.config.tensor_file_map: - qtensors = self.load_multi(["qweight", "qzeros", "scales", "g_idx"], override_key = override_key) + qtensors = self.load_multi(["qweight", "qzeros", "scales", "g_idx", "bias"], override_key = override_key) qtensors["scales"] = qtensors["scales"].half() return qtensors diff --git a/setup.py b/setup.py index 6d60d1c..cb32245 100644 --- a/setup.py +++ b/setup.py @@ -42,6 +42,7 @@ setup_kwargs = { "exllamav2/exllamav2_ext/ext_rope.cpp", "exllamav2/exllamav2_ext/ext_safetensors.cpp", "exllamav2/exllamav2_ext/ext_sampling.cpp", + "exllamav2/exllamav2_ext/cuda/h_add.cu", "exllamav2/exllamav2_ext/cuda/h_gemm.cu", "exllamav2/exllamav2_ext/cuda/lora.cu", "exllamav2/exllamav2_ext/cuda/pack_tensor.cu",