mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-26 01:19:20 +00:00
CLI - Specify GGML_TYPE to quantize for the main tensors. (#91)
To complement token_embd.weight and output.weight: attn_v.weight, attn_k.weight, attn_q.weight, attn_output.weight, attn_qkv.weight, ffn_gate, ffn_down, ffn_up.
This commit is contained in:
@@ -361,6 +361,14 @@ extern "C" {
         enum llama_ftype ftype;              // quantize to this llama_ftype
         enum ggml_type output_tensor_type;   // output tensor type
         enum ggml_type token_embedding_type; // token embeddings tensor type
+        enum ggml_type attn_q_type;          // attention query tensor type
+        enum ggml_type attn_k_type;          // attention key tensor type
+        enum ggml_type attn_v_type;          // attention value tensor type
+        enum ggml_type attn_qkv_type;        // attention query-key-value tensor type
+        enum ggml_type attn_output_type;     // attention output tensor type
+        enum ggml_type ffn_gate_type;        // feedforward network gate type
+        enum ggml_type ffn_down_type;        // feedforward network down type
+        enum ggml_type ffn_up_type;          // feedforward network up type
         bool allow_requantize;               // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;         // quantize output.weight
         bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
Reference in New Issue
Block a user