CLI - Specify GGML_TYPE to quantize for the main tensors. (#91)

To complement the token_embd.weight and output.weight :

attn_v.weight
attn_k.weight
attn_q.weight
attn_output.weight
attn_qkv.weight
ffn_gate
ffn_down
ffn_up
This commit is contained in:
Nexes the Elder
2024-10-18 09:48:15 +02:00
committed by GitHub
parent 76b97c8064
commit 03cabe1540
3 changed files with 125 additions and 13 deletions

View File

@@ -361,6 +361,14 @@ extern "C" {
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type
enum ggml_type attn_q_type; // attention query tensor type
enum ggml_type attn_k_type; // attention key tensor type
enum ggml_type attn_v_type; // attention value tensor type
enum ggml_type attn_qkv_type; // attention query-key-value tensor type
enum ggml_type attn_output_type; // attention output tensor type
enum ggml_type ffn_gate_type; // feedforward network gate type
enum ggml_type ffn_down_type; // feedforward network down type
enum ggml_type ffn_up_type; // feedforward network up type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored