### 🐛 [#262](https://github.com/ikawrakow/ik_llama.cpp/pull/262) - Fix [#261](https://github.com/ikawrakow/ik_llama.cpp/issues/261)
| **Author** | `ikawrakow` |
| :--- | :--- |
| **State** | ❌ **Closed** |
| **Created** | 2025-03-18 |
| **Updated** | 2025-03-18 |
---
#### 💬 Conversation
👤 **davidsyoung** commented the **2025-03-18** at **10:41:29**:<br>
Unfortunately I'm still getting NaNs under perplexity. I built with the latest PR regarding the q8_0 KV cache.
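
(The exact perplexity invocation isn't shown in the thread; a typical run against the quant produced below might look like the following sketch. The paths and test file are hypothetical, and `-ctk`/`-ctv` select the q8_0 K/V cache types discussed in this PR.)

```
# Hypothetical repro sketch -- the actual command is not given in this thread.
# -ctk/-ctv set the K and V cache types to q8_0, matching the setup above.
./build/bin/llama-perplexity \
  -m /models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf \
  -f wiki.test.raw \
  -ctk q8_0 -ctv q8_0
```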
<details>
<summary>Quant command</summary>
```
./build/bin/llama-quantize --imatrix /models/deepseek-config/imatrix.dat \
--token-embedding-type q8_0 \
--attn-q-type q6_K \
--attn-k-type q6_K \
--attn-v-type q6_K \
--attn-qkv-type q6_K \
--attn-output-type q6_K \
--ffn-gate-type q6_K \
--ffn-down-type q6_K \
--ffn-up-type q6_K \
--custom-q "\.ffn_.*_shexp\.weight=q6_K,output\.weight=q6_K" \
--custom-q "blk\.3\.ffn_down_exps\.weight=q5_K,blk\.4\.ffn_down_exps\.weight=q5_K,blk\.5\.ffn_down_exps\.weight=q5_K,blk\.3\.ffn_up_exps\.weight=iq4_k,blk\.3\.ffn_gate_exps\.weight=iq4_k,blk\.4\.ffn_up_exps\.weight=iq4_k,blk\.4\.ffn_gate_exps\.weight=iq4_k,blk\.5\.ffn_up_exps\.weight=iq4_k,blk\.5\.ffn_gate_exps\.weight=iq4_k" \
--custom-q "blk\.6\.ffn_down_exps\.weight=q5_K,blk\.7\.ffn_down_exps\.weight=q5_K,blk\.8\.ffn_down_exps\.weight=q5_K,blk\.6\.ffn_up_exps\.weight=iq4_k,blk\.6\.ffn_gate_exps\.weight=iq4_k,blk\.7\.ffn_up_exps\.weight=iq4_k,blk\.7\.ffn_gate_exps\.weight=iq4_k,blk\.8\.ffn_up_exps\.weight=iq4_k,blk\.8\.ffn_gate_exps\.weight=iq4_k" \
--custom-q "blk\.9\.ffn_down_exps\.weight=iq4_k,blk\.10\.ffn_down_exps\.weight=iq4_k,blk\.11\.ffn_down_exps\.weight=iq4_k,blk\.12\.ffn_down_exps\.weight=iq4_k,blk\.9\.ffn_up_exps\.weight=iq3_k,blk\.9\.ffn_gate_exps\.weight=iq3_k,blk\.10\.ffn_up_exps\.weight=iq3_k,blk\.10\.ffn_gate_exps\.weight=iq3_k,blk\.11\.ffn_up_exps\.weight=iq3_k,blk\.11\.ffn_gate_exps\.weight=iq3_k,blk\.12\.ffn_up_exps\.weight=iq3_k,blk\.12\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.13\.ffn_down_exps\.weight=iq4_k,blk\.14\.ffn_down_exps\.weight=iq4_k,blk\.15\.ffn_down_exps\.weight=iq4_k,blk\.16\.ffn_down_exps\.weight=iq4_k,blk\.13\.ffn_up_exps\.weight=iq3_k,blk\.13\.ffn_gate_exps\.weight=iq3_k,blk\.14\.ffn_up_exps\.weight=iq3_k,blk\.14\.ffn_gate_exps\.weight=iq3_k,blk\.15\.ffn_up_exps\.weight=iq3_k,blk\.15\.ffn_gate_exps\.weight=iq3_k,blk\.16\.ffn_up_exps\.weight=iq3_k,blk\.16\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.17\.ffn_down_exps\.weight=iq4_k,blk\.18\.ffn_down_exps\.weight=iq4_k,blk\.19\.ffn_down_exps\.weight=iq4_k,blk\.20\.ffn_down_exps\.weight=iq4_k,blk\.17\.ffn_up_exps\.weight=iq3_k,blk\.17\.ffn_gate_exps\.weight=iq3_k,blk\.18\.ffn_up_exps\.weight=iq3_k,blk\.18\.ffn_gate_exps\.weight=iq3_k,blk\.19\.ffn_up_exps\.weight=iq3_k,blk\.19\.ffn_gate_exps\.weight=iq3_k,blk\.20\.ffn_up_exps\.weight=iq3_k,blk\.20\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.21\.ffn_down_exps\.weight=iq4_k,blk\.22\.ffn_down_exps\.weight=iq4_k,blk\.23\.ffn_down_exps\.weight=iq4_k,blk\.24\.ffn_down_exps\.weight=iq4_k,blk\.21\.ffn_up_exps\.weight=iq3_k,blk\.21\.ffn_gate_exps\.weight=iq3_k,blk\.22\.ffn_up_exps\.weight=iq3_k,blk\.22\.ffn_gate_exps\.weight=iq3_k,blk\.23\.ffn_up_exps\.weight=iq3_k,blk\.23\.ffn_gate_exps\.weight=iq3_k,blk\.24\.ffn_up_exps\.weight=iq3_k,blk\.24\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.25\.ffn_down_exps\.weight=iq4_k,blk\.26\.ffn_down_exps\.weight=iq4_k,blk\.27\.ffn_down_exps\.weight=iq4_k,blk\.28\.ffn_down_exps\.weight=iq4_k,blk\.25\.ffn_up_exps\.weight=iq3_k,blk\.25\.ffn_gate_exps\.weight=iq3_k,blk\.26\.ffn_up_exps\.weight=iq3_k,blk\.26\.ffn_gate_exps\.weight=iq3_k,blk\.27\.ffn_up_exps\.weight=iq3_k,blk\.27\.ffn_gate_exps\.weight=iq3_k,blk\.28\.ffn_up_exps\.weight=iq3_k,blk\.28\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.29\.ffn_down_exps\.weight=iq4_k,blk\.30\.ffn_down_exps\.weight=iq4_k,blk\.31\.ffn_down_exps\.weight=iq4_k,blk\.32\.ffn_down_exps\.weight=iq4_k,blk\.29\.ffn_up_exps\.weight=iq3_k,blk\.29\.ffn_gate_exps\.weight=iq3_k,blk\.30\.ffn_up_exps\.weight=iq3_k,blk\.30\.ffn_gate_exps\.weight=iq3_k,blk\.31\.ffn_up_exps\.weight=iq3_k,blk\.31\.ffn_gate_exps\.weight=iq3_k,blk\.32\.ffn_up_exps\.weight=iq3_k,blk\.32\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.33\.ffn_down_exps\.weight=iq4_k,blk\.34\.ffn_down_exps\.weight=iq4_k,blk\.35\.ffn_down_exps\.weight=iq4_k,blk\.36\.ffn_down_exps\.weight=iq4_k,blk\.33\.ffn_up_exps\.weight=iq3_k,blk\.33\.ffn_gate_exps\.weight=iq3_k,blk\.34\.ffn_up_exps\.weight=iq3_k,blk\.34\.ffn_gate_exps\.weight=iq3_k,blk\.35\.ffn_up_exps\.weight=iq3_k,blk\.35\.ffn_gate_exps\.weight=iq3_k,blk\.36\.ffn_up_exps\.weight=iq3_k,blk\.36\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.37\.ffn_down_exps\.weight=iq4_k,blk\.38\.ffn_down_exps\.weight=iq4_k,blk\.39\.ffn_down_exps\.weight=iq4_k,blk\.40\.ffn_down_exps\.weight=iq4_k,blk\.37\.ffn_up_exps\.weight=iq3_k,blk\.37\.ffn_gate_exps\.weight=iq3_k,blk\.38\.ffn_up_exps\.weight=iq3_k,blk\.38\.ffn_gate_exps\.weight=iq3_k,blk\.39\.ffn_up_exps\.weight=iq3_k,blk\.39\.ffn_gate_exps\.weight=iq3_k,blk\.40\.ffn_up_exps\.weight=iq3_k,blk\.40\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.41\.ffn_down_exps\.weight=iq4_k,blk\.42\.ffn_down_exps\.weight=iq4_k,blk\.43\.ffn_down_exps\.weight=iq4_k,blk\.44\.ffn_down_exps\.weight=iq4_k,blk\.41\.ffn_up_exps\.weight=iq3_k,blk\.41\.ffn_gate_exps\.weight=iq3_k,blk\.42\.ffn_up_exps\.weight=iq3_k,blk\.42\.ffn_gate_exps\.weight=iq3_k,blk\.43\.ffn_up_exps\.weight=iq3_k,blk\.43\.ffn_gate_exps\.weight=iq3_k,blk\.44\.ffn_up_exps\.weight=iq3_k,blk\.44\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.45\.ffn_down_exps\.weight=iq4_k,blk\.46\.ffn_down_exps\.weight=iq4_k,blk\.47\.ffn_down_exps\.weight=iq4_k,blk\.48\.ffn_down_exps\.weight=iq4_k,blk\.45\.ffn_up_exps\.weight=iq3_k,blk\.45\.ffn_gate_exps\.weight=iq3_k,blk\.46\.ffn_up_exps\.weight=iq3_k,blk\.46\.ffn_gate_exps\.weight=iq3_k,blk\.47\.ffn_up_exps\.weight=iq3_k,blk\.47\.ffn_gate_exps\.weight=iq3_k,blk\.48\.ffn_up_exps\.weight=iq3_k,blk\.48\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.49\.ffn_down_exps\.weight=iq4_k,blk\.50\.ffn_down_exps\.weight=iq4_k,blk\.51\.ffn_down_exps\.weight=iq4_k,blk\.52\.ffn_down_exps\.weight=iq4_k,blk\.49\.ffn_up_exps\.weight=iq3_k,blk\.49\.ffn_gate_exps\.weight=iq3_k,blk\.50\.ffn_up_exps\.weight=iq3_k,blk\.50\.ffn_gate_exps\.weight=iq3_k,blk\.51\.ffn_up_exps\.weight=iq3_k,blk\.51\.ffn_gate_exps\.weight=iq3_k,blk\.52\.ffn_up_exps\.weight=iq3_k,blk\.52\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.53\.ffn_down_exps\.weight=iq4_k,blk\.54\.ffn_down_exps\.weight=iq4_k,blk\.55\.ffn_down_exps\.weight=iq4_k,blk\.56\.ffn_down_exps\.weight=iq4_k,blk\.53\.ffn_up_exps\.weight=iq3_k,blk\.53\.ffn_gate_exps\.weight=iq3_k,blk\.54\.ffn_up_exps\.weight=iq3_k,blk\.54\.ffn_gate_exps\.weight=iq3_k,blk\.55\.ffn_up_exps\.weight=iq3_k,blk\.55\.ffn_gate_exps\.weight=iq3_k,blk\.56\.ffn_up_exps\.weight=iq3_k,blk\.56\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.57\.ffn_down_exps\.weight=iq4_k,blk\.58\.ffn_down_exps\.weight=iq4_k,blk\.59\.ffn_down_exps\.weight=iq4_k,blk\.60\.ffn_down_exps\.weight=iq4_k,blk\.57\.ffn_up_exps\.weight=iq3_k,blk\.57\.ffn_gate_exps\.weight=iq3_k,blk\.58\.ffn_up_exps\.weight=iq3_k,blk\.58\.ffn_gate_exps\.weight=iq3_k,blk\.59\.ffn_up_exps\.weight=iq3_k,blk\.59\.ffn_gate_exps\.weight=iq3_k,blk\.60\.ffn_up_exps\.weight=iq3_k,blk\.60\.ffn_gate_exps\.weight=iq3_k" \
/storage/DeepSeek-R1-GGUF/unsloth_DeepSeek-R1-BF16-256x21B-F16-00001-of-00059.gguf \
/models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf \
q6_K 6
```
</details>
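
(As an aside, the long runs of `--custom-q` arguments above follow a regular per-block pattern: blocks 3-8 get `ffn_down_exps` = q5_K with up/gate = iq4_k, and blocks 9-60 get `ffn_down_exps` = iq4_k with up/gate = iq3_k. A minimal bash sketch could generate them instead of spelling them out; the `gen_rules` helper below is hypothetical and assumes only that `llama-quantize` accepts repeated `--custom-q` flags, which the command above already demonstrates.)

```
# Hypothetical helper: regenerate the repeated per-block rules.
#   Blocks 3-8:  ffn_down_exps -> q5_K,  ffn_up/gate_exps -> iq4_k
#   Blocks 9-60: ffn_down_exps -> iq4_k, ffn_up/gate_exps -> iq3_k
gen_rules() {
  for b in $(seq 3 60); do
    if [ "$b" -le 8 ]; then down=q5_K; upgate=iq4_k; else down=iq4_k; upgate=iq3_k; fi
    # printf turns each '\\' into a literal backslash, so the program
    # receives regex patterns like blk\.3\.ffn_down_exps\.weight=q5_K.
    printf -- '--custom-q blk\\.%d\\.ffn_down_exps\\.weight=%s ' "$b" "$down"
    printf -- '--custom-q blk\\.%d\\.ffn_up_exps\\.weight=%s '   "$b" "$upgate"
    printf -- '--custom-q blk\\.%d\\.ffn_gate_exps\\.weight=%s ' "$b" "$upgate"
  done
}
# Usage (left unquoted on purpose so each rule becomes its own argument):
# ./build/bin/llama-quantize --imatrix ... $(gen_rules) input.gguf output.gguf q6_K 6
```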
<details>
<summary>Quant command output</summary>
```
Adding custom rule \.ffn_.*_shexp\.weight -> q6_K
Adding custom rule output\.weight -> q6_K
Adding custom rule blk\.3\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.4\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.5\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.3\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.3\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.4\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.4\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.5\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.5\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.6\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.7\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.8\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.6\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.6\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.7\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.7\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.8\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.8\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.9\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.10\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.11\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.12\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.9\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.9\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.10\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.10\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.11\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.11\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.12\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.12\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.13\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.14\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.15\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.16\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.13\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.13\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.14\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.14\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.15\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.15\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.16\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.16\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.17\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.18\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.19\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.20\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.17\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.17\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.18\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.18\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.19\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.19\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.20\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.20\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.21\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.22\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.23\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.24\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.21\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.21\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.22\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.22\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.23\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.23\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.24\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.24\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.25\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.26\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.27\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.28\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.25\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.25\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.26\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.26\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.27\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.27\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.28\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.28\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.29\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.30\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.31\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.32\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.29\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.29\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.30\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.30\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.31\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.31\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.32\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.32\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.33\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.34\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.35\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.36\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.33\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.33\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.34\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.34\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.35\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.35\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.36\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.36\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.37\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.38\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.39\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.40\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.37\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.37\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.38\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.38\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.39\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.39\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.40\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.40\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.41\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.42\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.43\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.44\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.41\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.41\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.42\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.42\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.43\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.43\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.44\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.44\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.45\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.46\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.47\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.48\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.45\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.45\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.46\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.46\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.47\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.47\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.48\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.48\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.49\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.50\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.51\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.52\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.49\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.49\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.50\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.50\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.51\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.51\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.52\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.52\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.53\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.54\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.55\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.56\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.53\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.53\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.54\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.54\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.55\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.55\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.56\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.56\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.57\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.58\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.59\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.60\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.57\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.57\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.58\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.58\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.59\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.59\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.60\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.60\.ffn_gate_exps\.weight -> iq3_k
load_imatrix: imatrix dataset='imatrix-training-full-3'
load_imatrix: loaded 720 importance matrix entries from /models/deepseek-config/imatrix.dat computed on 315 chunks
prepare_imatrix: have 720 importance matrix entries
main: build = 0 (unknown)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: quantizing '/storage/DeepSeek-R1-GGUF/unsloth_DeepSeek-R1-BF16-256x21B-F16-00001-of-00059.gguf' to '/models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf' as Q6_K using 64 threads
llama_model_loader: additional 58 GGUFs metadata loaded.
llama_model_loader: loaded meta data with 53 key-value pairs and 1147 tensors from /storage/DeepSeek-R1-GGUF/unsloth_DeepSeek-R1-BF16-256x21B-F16-00001-of-00059.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = deepseek2
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = unsloth_DeepSeek R1 BF16
llama_model_loader: - kv 3: general.size_label str = 256x21B
llama_model_loader: - kv 4: general.license str = mit
llama_model_loader: - kv 5: general.base_model.count u32 = 1
llama_model_loader: - kv 6: general.base_model.0.name str = DeepSeek R1
llama_model_loader: - kv 7: general.base_model.0.organization str = Deepseek Ai
llama_model_loader: - kv 8: general.base_model.0.repo_url str = https://huggingface.co/deepseek-ai/De...
llama_model_loader: - kv 9: general.tags arr[str,3] = ["deepseek", "unsloth", "transformers"]
llama_model_loader: - kv 10: general.languages arr[str,1] = ["en"]
llama_model_loader: - kv 11: deepseek2.block_count u32 = 61
llama_model_loader: - kv 12: deepseek2.context_length u32 = 163840
llama_model_loader: - kv 13: deepseek2.embedding_length u32 = 7168
llama_model_loader: - kv 14: deepseek2.feed_forward_length u32 = 18432
llama_model_loader: - kv 15: deepseek2.attention.head_count u32 = 128
llama_model_loader: - kv 16: deepseek2.attention.head_count_kv u32 = 128
llama_model_loader: - kv 17: deepseek2.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 18: deepseek2.attention.layer_norm_rms_epsilon f32 = 0.000001
llama_model_loader: - kv 19: deepseek2.expert_used_count u32 = 8
llama_model_loader: - kv 20: general.file_type u32 = 1
llama_model_loader: - kv 21: deepseek2.leading_dense_block_count u32 = 3
llama_model_loader: - kv 22: deepseek2.vocab_size u32 = 129280
llama_model_loader: - kv 23: deepseek2.attention.q_lora_rank u32 = 1536
llama_model_loader: - kv 24: deepseek2.attention.kv_lora_rank u32 = 512
llama_model_loader: - kv 25: deepseek2.attention.key_length u32 = 192
llama_model_loader: - kv 26: deepseek2.attention.value_length u32 = 128
llama_model_loader: - kv 27: deepseek2.expert_feed_forward_length u32 = 2048
llama_model_loader: - kv 28: deepseek2.expert_count u32 = 256
llama_model_loader: - kv 29: deepseek2.expert_shared_count u32 = 1
llama_model_loader: - kv 30: deepseek2.expert_weights_scale f32 = 2.500000
llama_model_loader: - kv 31: deepseek2.expert_weights_norm bool = true
llama_model_loader: - kv 32: deepseek2.expert_gating_func u32 = 2
llama_model_loader: - kv 33: deepseek2.rope.dimension_count u32 = 64
llama_model_loader: - kv 34: deepseek2.rope.scaling.type str = yarn
llama_model_loader: - kv 35: deepseek2.rope.scaling.factor f32 = 40.000000
llama_model_loader: - kv 36: deepseek2.rope.scaling.original_context_length u32 = 4096
llama_model_loader: - kv 37: deepseek2.rope.scaling.yarn_log_multiplier f32 = 0.100000
llama_model_loader: - kv 38: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 39: tokenizer.ggml.pre str = deepseek-v3
llama_model_loader: - kv 40: tokenizer.ggml.tokens arr[str,129280] = ["<｜begin▁of▁sentence｜>", "<｜...
llama_model_loader: - kv 41: tokenizer.ggml.token_type arr[i32,129280] = [3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 42: tokenizer.ggml.merges arr[str,127741] = ["Ġ t", "Ġ a", "i n", "Ġ Ġ", "h e...
llama_model_loader: - kv 43: tokenizer.ggml.bos_token_id u32 = 0
llama_model_loader: - kv 44: tokenizer.ggml.eos_token_id u32 = 1
llama_model_loader: - kv 45: tokenizer.ggml.padding_token_id u32 = 128815
llama_model_loader: - kv 46: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 47: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 48: tokenizer.chat_template str = {% if not add_generation_prompt is de...
llama_model_loader: - kv 49: general.quantization_version u32 = 2
llama_model_loader: - kv 50: split.no u16 = 0
llama_model_loader: - kv 51: split.count u16 = 59
llama_model_loader: - kv 52: split.tensors.count i32 = 1147
llama_model_loader: - type f32: 361 tensors
llama_model_loader: - type f16: 786 tensors
================================ Have weights data with 720 entries
[ 1/1147] token_embd.weight - [ 7168, 129280, 1, 1], type = f16,
====== llama_model_quantize_internal: did not find weights for token_embd.weight
converting to q8_0 .. size = 1767.50 MiB -> 938.98 MiB
[ 2/1147] blk.0.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 3/1147] blk.0.ffn_down.weight - [18432, 7168, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 4/1147] blk.0.ffn_gate.weight - [ 7168, 18432, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 5/1147] blk.0.ffn_up.weight - [ 7168, 18432, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 6/1147] blk.0.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 7/1147] blk.0.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 8/1147] blk.0.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 9/1147] blk.0.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 10/1147] blk.0.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.0.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 11/1147] blk.0.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 12/1147] blk.0.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.0.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 13/1147] blk.0.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 14/1147] blk.0.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 15/1147] blk.0.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 16/1147] blk.1.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 17/1147] blk.1.ffn_down.weight - [18432, 7168, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 18/1147] blk.1.ffn_gate.weight - [ 7168, 18432, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 19/1147] blk.1.ffn_up.weight - [ 7168, 18432, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 20/1147] blk.1.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 21/1147] blk.1.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 22/1147] blk.1.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 23/1147] blk.1.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 24/1147] blk.1.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.1.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 25/1147] blk.1.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 26/1147] blk.1.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.1.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 27/1147] blk.1.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 28/1147] blk.1.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 29/1147] blk.1.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 30/1147] blk.2.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 31/1147] blk.2.ffn_down.weight - [18432, 7168, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 32/1147] blk.2.ffn_gate.weight - [ 7168, 18432, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 33/1147] blk.2.ffn_up.weight - [ 7168, 18432, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 34/1147] blk.2.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 35/1147] blk.2.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 36/1147] blk.2.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 37/1147] blk.2.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 38/1147] blk.2.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.2.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 39/1147] blk.2.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 40/1147] blk.2.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.2.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 41/1147] blk.2.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 42/1147] blk.2.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 43/1147] blk.2.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 44/1147] blk.3.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 45/1147] blk.3.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 46/1147] blk.3.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.3.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 47/1147] blk.3.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.3.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 48/1147] blk.3.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.3.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 49/1147] blk.3.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 50/1147] blk.3.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 51/1147] blk.3.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 52/1147] blk.3.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.3.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 53/1147] blk.3.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 54/1147] blk.3.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.3.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 55/1147] blk.3.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 56/1147] blk.3.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 57/1147] blk.3.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 58/1147] blk.3.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 59/1147] blk.3.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type q5_K for tensor blk.3.ffn_down_exps.weight
converting to q5_K .. size = 7168.00 MiB -> 2464.00 MiB
[ 60/1147] blk.3.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.3.ffn_gate_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 61/1147] blk.3.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.3.ffn_up_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 62/1147] blk.3.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 63/1147] blk.4.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 64/1147] blk.4.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 65/1147] blk.4.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.4.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 66/1147] blk.4.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.4.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 67/1147] blk.4.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.4.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 68/1147] blk.4.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 69/1147] blk.4.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 70/1147] blk.4.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 71/1147] blk.4.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.4.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 72/1147] blk.4.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 73/1147] blk.4.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.4.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 74/1147] blk.4.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 75/1147] blk.4.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 76/1147] blk.4.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 77/1147] blk.4.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 78/1147] blk.4.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type q5_K for tensor blk.4.ffn_down_exps.weight
converting to q5_K .. size = 7168.00 MiB -> 2464.00 MiB
[ 79/1147] blk.4.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.4.ffn_gate_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 80/1147] blk.4.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.4.ffn_up_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 81/1147] blk.4.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 82/1147] blk.5.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 83/1147] blk.5.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 84/1147] blk.5.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 85/1147] blk.5.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.5.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 86/1147] blk.5.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 87/1147] blk.5.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.5.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 88/1147] blk.5.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 89/1147] blk.5.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 90/1147] blk.5.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 91/1147] blk.5.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 92/1147] blk.5.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 93/1147] blk.5.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.5.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 94/1147] blk.5.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.5.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 95/1147] blk.5.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.5.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 96/1147] blk.5.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 97/1147] blk.5.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type q5_K for tensor blk.5.ffn_down_exps.weight
converting to q5_K .. size = 7168.00 MiB -> 2464.00 MiB
[ 98/1147] blk.5.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.5.ffn_gate_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 99/1147] blk.5.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.5.ffn_up_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 100/1147] blk.5.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 101/1147] blk.6.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 102/1147] blk.6.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 103/1147] blk.6.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.6.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 104/1147] blk.6.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.6.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 105/1147] blk.6.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.6.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 106/1147] blk.6.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 107/1147] blk.6.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 108/1147] blk.6.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 109/1147] blk.6.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.6.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 110/1147] blk.6.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 111/1147] blk.6.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.6.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 112/1147] blk.6.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 113/1147] blk.6.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 114/1147] blk.6.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 115/1147] blk.6.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 116/1147] blk.6.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type q5_K for tensor blk.6.ffn_down_exps.weight
converting to q5_K .. size = 7168.00 MiB -> 2464.00 MiB
[ 117/1147] blk.6.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.6.ffn_gate_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 118/1147] blk.6.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.6.ffn_up_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 119/1147] blk.6.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 120/1147] blk.7.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 121/1147] blk.7.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 122/1147] blk.7.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.7.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 123/1147] blk.7.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.7.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 124/1147] blk.7.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.7.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 125/1147] blk.7.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 126/1147] blk.7.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 127/1147] blk.7.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 128/1147] blk.7.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.7.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 129/1147] blk.7.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 130/1147] blk.7.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.7.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 131/1147] blk.7.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 132/1147] blk.7.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 133/1147] blk.7.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 134/1147] blk.7.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 135/1147] blk.7.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type q5_K for tensor blk.7.ffn_down_exps.weight
converting to q5_K .. size = 7168.00 MiB -> 2464.00 MiB
[ 136/1147] blk.7.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.7.ffn_gate_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 137/1147] blk.7.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.7.ffn_up_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 138/1147] blk.7.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 139/1147] blk.8.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 140/1147] blk.8.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 141/1147] blk.8.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.8.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 142/1147] blk.8.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.8.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 143/1147] blk.8.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.8.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 144/1147] blk.8.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 145/1147] blk.8.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 146/1147] blk.8.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 147/1147] blk.8.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.8.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 148/1147] blk.8.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 149/1147] blk.8.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.8.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 150/1147] blk.8.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 151/1147] blk.8.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 152/1147] blk.8.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 153/1147] blk.8.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 154/1147] blk.8.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type q5_K for tensor blk.8.ffn_down_exps.weight
converting to q5_K .. size = 7168.00 MiB -> 2464.00 MiB
[ 155/1147] blk.8.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.8.ffn_gate_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 156/1147] blk.8.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.8.ffn_up_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 157/1147] blk.8.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 158/1147] blk.9.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 159/1147] blk.9.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 160/1147] blk.9.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.9.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 161/1147] blk.9.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.9.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 162/1147] blk.9.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.9.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 163/1147] blk.9.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 164/1147] blk.9.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 165/1147] blk.9.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 166/1147] blk.9.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.9.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 167/1147] blk.9.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 168/1147] blk.9.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.9.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 169/1147] blk.9.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 170/1147] blk.9.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 171/1147] blk.9.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 172/1147] blk.10.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 173/1147] blk.10.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 174/1147] blk.10.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.10.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 175/1147] blk.10.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.10.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 176/1147] blk.10.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.10.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 177/1147] blk.10.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 178/1147] blk.10.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 179/1147] blk.10.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 180/1147] blk.10.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.10.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 181/1147] blk.10.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 182/1147] blk.10.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.10.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 183/1147] blk.10.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 184/1147] blk.10.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 185/1147] blk.10.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 186/1147] blk.9.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 187/1147] blk.9.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.9.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 188/1147] blk.9.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.9.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 189/1147] blk.9.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.9.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 190/1147] blk.9.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 191/1147] blk.10.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 192/1147] blk.10.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.10.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 193/1147] blk.10.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.10.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 194/1147] blk.10.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.10.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 195/1147] blk.10.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 196/1147] blk.11.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 197/1147] blk.11.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 198/1147] blk.11.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.11.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 199/1147] blk.11.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.11.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 200/1147] blk.11.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.11.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 201/1147] blk.11.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 202/1147] blk.11.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 203/1147] blk.11.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 204/1147] blk.11.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.11.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 205/1147] blk.11.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 206/1147] blk.11.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.11.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 207/1147] blk.11.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 208/1147] blk.11.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 209/1147] blk.11.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 210/1147] blk.11.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 211/1147] blk.11.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.11.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 212/1147] blk.11.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.11.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 213/1147] blk.11.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.11.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 214/1147] blk.11.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 215/1147] blk.12.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 216/1147] blk.12.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 217/1147] blk.12.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.12.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 218/1147] blk.12.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.12.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 219/1147] blk.12.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.12.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 220/1147] blk.12.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 221/1147] blk.12.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 222/1147] blk.12.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 223/1147] blk.12.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.12.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 224/1147] blk.12.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 225/1147] blk.12.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.12.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 226/1147] blk.12.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 227/1147] blk.12.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 228/1147] blk.12.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 229/1147] blk.12.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 230/1147] blk.12.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.12.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 231/1147] blk.12.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.12.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 232/1147] blk.12.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.12.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 233/1147] blk.12.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 234/1147] blk.13.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 235/1147] blk.13.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 236/1147] blk.13.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.13.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 237/1147] blk.13.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.13.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 238/1147] blk.13.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.13.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 239/1147] blk.13.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 240/1147] blk.13.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 241/1147] blk.13.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 242/1147] blk.13.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.13.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 243/1147] blk.13.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 244/1147] blk.13.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.13.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 245/1147] blk.13.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 246/1147] blk.13.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 247/1147] blk.13.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 248/1147] blk.13.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 249/1147] blk.13.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.13.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 250/1147] blk.13.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.13.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 251/1147] blk.13.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.13.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 252/1147] blk.13.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 253/1147] blk.14.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 254/1147] blk.14.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 255/1147] blk.14.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.14.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 256/1147] blk.14.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.14.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 257/1147] blk.14.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.14.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 258/1147] blk.14.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 259/1147] blk.14.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 260/1147] blk.14.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 261/1147] blk.14.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.14.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 262/1147] blk.14.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 263/1147] blk.14.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.14.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 264/1147] blk.14.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 265/1147] blk.14.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 266/1147] blk.14.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 267/1147] blk.14.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 268/1147] blk.14.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.14.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 269/1147] blk.14.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.14.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 270/1147] blk.14.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.14.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 271/1147] blk.14.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 272/1147] blk.15.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 273/1147] blk.15.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 274/1147] blk.15.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.15.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 275/1147] blk.15.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.15.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 276/1147] blk.15.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.15.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 277/1147] blk.15.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 278/1147] blk.15.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 279/1147] blk.15.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 280/1147] blk.15.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.15.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 281/1147] blk.15.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 282/1147] blk.15.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.15.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 283/1147] blk.15.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 284/1147] blk.15.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 285/1147] blk.15.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 286/1147] blk.15.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 287/1147] blk.15.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.15.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 288/1147] blk.15.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.15.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 289/1147] blk.15.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.15.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 290/1147] blk.15.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 291/1147] blk.16.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 292/1147] blk.16.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 293/1147] blk.16.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.16.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 294/1147] blk.16.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.16.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 295/1147] blk.16.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.16.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 296/1147] blk.16.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 297/1147] blk.16.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 298/1147] blk.16.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 299/1147] blk.16.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.16.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 300/1147] blk.16.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 301/1147] blk.16.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.16.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 302/1147] blk.16.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 303/1147] blk.16.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 304/1147] blk.16.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 305/1147] blk.16.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 306/1147] blk.16.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.16.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 307/1147] blk.16.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.16.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 308/1147] blk.16.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.16.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 309/1147] blk.16.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 310/1147] blk.17.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 311/1147] blk.17.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 312/1147] blk.17.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.17.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 313/1147] blk.17.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.17.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 314/1147] blk.17.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.17.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 315/1147] blk.17.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 316/1147] blk.17.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 317/1147] blk.17.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 318/1147] blk.17.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.17.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 319/1147] blk.17.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 320/1147] blk.17.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.17.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 321/1147] blk.17.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 322/1147] blk.17.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 323/1147] blk.17.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 324/1147] blk.17.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 325/1147] blk.17.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.17.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 326/1147] blk.17.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.17.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 327/1147] blk.17.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.17.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 328/1147] blk.17.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 329/1147] blk.18.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 330/1147] blk.18.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 331/1147] blk.18.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.18.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 332/1147] blk.18.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.18.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 333/1147] blk.18.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.18.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 334/1147] blk.18.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 335/1147] blk.18.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 336/1147] blk.18.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 337/1147] blk.18.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.18.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 338/1147] blk.18.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 339/1147] blk.18.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.18.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 340/1147] blk.18.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 341/1147] blk.18.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 342/1147] blk.18.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 343/1147] blk.18.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 344/1147] blk.18.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.18.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 345/1147] blk.18.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.18.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 346/1147] blk.18.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.18.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 347/1147] blk.18.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 348/1147] blk.19.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 349/1147] blk.19.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 350/1147] blk.19.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.19.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 351/1147] blk.19.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.19.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 352/1147] blk.19.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.19.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 353/1147] blk.19.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 354/1147] blk.19.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 355/1147] blk.19.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 356/1147] blk.19.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.19.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 357/1147] blk.19.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 358/1147] blk.19.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.19.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 359/1147] blk.19.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 360/1147] blk.19.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 361/1147] blk.19.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 362/1147] blk.19.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 363/1147] blk.19.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.19.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 364/1147] blk.19.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.19.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 365/1147] blk.19.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.19.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 366/1147] blk.19.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 367/1147] blk.20.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 368/1147] blk.20.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 369/1147] blk.20.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.20.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 370/1147] blk.20.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.20.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 371/1147] blk.20.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.20.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 372/1147] blk.20.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 373/1147] blk.20.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 374/1147] blk.20.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 375/1147] blk.20.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.20.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 376/1147] blk.20.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 377/1147] blk.20.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.20.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 378/1147] blk.20.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 379/1147] blk.20.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 380/1147] blk.20.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 381/1147] blk.20.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 382/1147] blk.20.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.20.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 383/1147] blk.20.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.20.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 384/1147] blk.20.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.20.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 385/1147] blk.20.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 386/1147] blk.21.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 387/1147] blk.21.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 388/1147] blk.21.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.21.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 389/1147] blk.21.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.21.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 390/1147] blk.21.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.21.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 391/1147] blk.21.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 392/1147] blk.21.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 393/1147] blk.21.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 394/1147] blk.21.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.21.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 395/1147] blk.21.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 396/1147] blk.21.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.21.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 397/1147] blk.21.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 398/1147] blk.21.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 399/1147] blk.21.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 400/1147] blk.21.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 401/1147] blk.21.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.21.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 402/1147] blk.21.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.21.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 403/1147] blk.21.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.21.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 404/1147] blk.21.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 405/1147] blk.22.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 406/1147] blk.22.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 407/1147] blk.22.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.22.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 408/1147] blk.22.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.22.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 409/1147] blk.22.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.22.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 410/1147] blk.22.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 411/1147] blk.22.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 412/1147] blk.22.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 413/1147] blk.22.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.22.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 414/1147] blk.22.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 415/1147] blk.22.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.22.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 416/1147] blk.22.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 417/1147] blk.22.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 418/1147] blk.22.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 419/1147] blk.22.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 420/1147] blk.22.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.22.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 421/1147] blk.22.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.22.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 422/1147] blk.22.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.22.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 423/1147] blk.22.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 424/1147] blk.23.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 425/1147] blk.23.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 426/1147] blk.23.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.23.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 427/1147] blk.23.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.23.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 428/1147] blk.23.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.23.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 429/1147] blk.23.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 430/1147] blk.23.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 431/1147] blk.23.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 432/1147] blk.23.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.23.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 433/1147] blk.23.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 434/1147] blk.23.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.23.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 435/1147] blk.23.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 436/1147] blk.23.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 437/1147] blk.23.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 438/1147] blk.23.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 439/1147] blk.23.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.23.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 440/1147] blk.23.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.23.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 441/1147] blk.23.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.23.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 442/1147] blk.23.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 443/1147] blk.24.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 444/1147] blk.24.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 445/1147] blk.24.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.24.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 446/1147] blk.24.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.24.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 447/1147] blk.24.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.24.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 448/1147] blk.24.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 449/1147] blk.24.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 450/1147] blk.24.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 451/1147] blk.24.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.24.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 452/1147] blk.24.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 453/1147] blk.24.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.24.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 454/1147] blk.24.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 455/1147] blk.24.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 456/1147] blk.24.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 457/1147] blk.24.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 458/1147] blk.24.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.24.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 459/1147] blk.24.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.24.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 460/1147] blk.24.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.24.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 461/1147] blk.24.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 462/1147] blk.25.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 463/1147] blk.25.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 464/1147] blk.25.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.25.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 465/1147] blk.25.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.25.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 466/1147] blk.25.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.25.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 467/1147] blk.25.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 468/1147] blk.25.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 469/1147] blk.25.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 470/1147] blk.25.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.25.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 471/1147] blk.25.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 472/1147] blk.25.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.25.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 473/1147] blk.25.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 474/1147] blk.25.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 475/1147] blk.25.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 476/1147] blk.25.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 477/1147] blk.25.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.25.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 478/1147] blk.25.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.25.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 479/1147] blk.25.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.25.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 480/1147] blk.25.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 481/1147] blk.26.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 482/1147] blk.26.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 483/1147] blk.26.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.26.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 484/1147] blk.26.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.26.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 485/1147] blk.26.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.26.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 486/1147] blk.26.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 487/1147] blk.26.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 488/1147] blk.26.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 489/1147] blk.26.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.26.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 490/1147] blk.26.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 491/1147] blk.26.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.26.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 492/1147] blk.26.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 493/1147] blk.26.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 494/1147] blk.26.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 495/1147] blk.26.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 496/1147] blk.26.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.26.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 497/1147] blk.26.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.26.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 498/1147] blk.26.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.26.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 499/1147] blk.26.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 500/1147] blk.27.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 501/1147] blk.27.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 502/1147] blk.27.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.27.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 503/1147] blk.27.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.27.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 504/1147] blk.27.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.27.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 505/1147] blk.27.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 506/1147] blk.27.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 507/1147] blk.27.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 508/1147] blk.27.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.27.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 509/1147] blk.27.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 510/1147] blk.27.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.27.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 511/1147] blk.27.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 512/1147] blk.27.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 513/1147] blk.27.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 514/1147] blk.27.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 515/1147] blk.27.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.27.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 516/1147] blk.27.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.27.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 517/1147] blk.27.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.27.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 518/1147] blk.27.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 519/1147] blk.28.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 520/1147] blk.28.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 521/1147] blk.28.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.28.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 522/1147] blk.28.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.28.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 523/1147] blk.28.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.28.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 524/1147] blk.28.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 525/1147] blk.28.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 526/1147] blk.28.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 527/1147] blk.28.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.28.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 528/1147] blk.28.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 529/1147] blk.28.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.28.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 530/1147] blk.28.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 531/1147] blk.28.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 532/1147] blk.28.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 533/1147] blk.28.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 534/1147] blk.28.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.28.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 535/1147] blk.28.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.28.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 536/1147] blk.28.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.28.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 537/1147] blk.28.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 538/1147] blk.29.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 539/1147] blk.29.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 540/1147] blk.29.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.29.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 541/1147] blk.29.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.29.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 542/1147] blk.29.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.29.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 543/1147] blk.29.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 544/1147] blk.29.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 545/1147] blk.29.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 546/1147] blk.29.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.29.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 547/1147] blk.29.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 548/1147] blk.29.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.29.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 549/1147] blk.29.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 550/1147] blk.29.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 551/1147] blk.29.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 552/1147] blk.29.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 553/1147] blk.29.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.29.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 554/1147] blk.29.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.29.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 555/1147] blk.29.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.29.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 556/1147] blk.29.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 557/1147] blk.30.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 558/1147] blk.30.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 559/1147] blk.30.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.30.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 560/1147] blk.30.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.30.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 561/1147] blk.30.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.30.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 562/1147] blk.30.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 563/1147] blk.30.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 564/1147] blk.30.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 565/1147] blk.30.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.30.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 566/1147] blk.30.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 567/1147] blk.30.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.30.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 568/1147] blk.30.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 569/1147] blk.30.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 570/1147] blk.30.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 571/1147] blk.30.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 572/1147] blk.30.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.30.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 573/1147] blk.30.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.30.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 574/1147] blk.30.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.30.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 575/1147] blk.30.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 576/1147] blk.31.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 577/1147] blk.31.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 578/1147] blk.31.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.31.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 579/1147] blk.31.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.31.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 580/1147] blk.31.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.31.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 581/1147] blk.31.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 582/1147] blk.31.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 583/1147] blk.31.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 584/1147] blk.31.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.31.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 585/1147] blk.31.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 586/1147] blk.31.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.31.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 587/1147] blk.31.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 588/1147] blk.31.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 589/1147] blk.31.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 590/1147] blk.31.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 591/1147] blk.31.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.31.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 592/1147] blk.31.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.31.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 593/1147] blk.31.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.31.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 594/1147] blk.31.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 595/1147] blk.32.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 596/1147] blk.32.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 597/1147] blk.32.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.32.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 598/1147] blk.32.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.32.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 599/1147] blk.32.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.32.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 600/1147] blk.32.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 601/1147] blk.32.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 602/1147] blk.32.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 603/1147] blk.32.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.32.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 604/1147] blk.32.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 605/1147] blk.32.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.32.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 606/1147] blk.32.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 607/1147] blk.32.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 608/1147] blk.32.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 609/1147] blk.32.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 610/1147] blk.32.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.32.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 611/1147] blk.32.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.32.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 612/1147] blk.32.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.32.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 613/1147] blk.32.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 614/1147] blk.33.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 615/1147] blk.33.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 616/1147] blk.33.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.33.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 617/1147] blk.33.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.33.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 618/1147] blk.33.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.33.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 619/1147] blk.33.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 620/1147] blk.33.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 621/1147] blk.33.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 622/1147] blk.33.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.33.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 623/1147] blk.33.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 624/1147] blk.33.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.33.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 625/1147] blk.33.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 626/1147] blk.33.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 627/1147] blk.33.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 628/1147] blk.33.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 629/1147] blk.33.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.33.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 630/1147] blk.33.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.33.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 631/1147] blk.33.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.33.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 632/1147] blk.33.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 633/1147] blk.34.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 634/1147] blk.34.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 635/1147] blk.34.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.34.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 636/1147] blk.34.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.34.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 637/1147] blk.34.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.34.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 638/1147] blk.34.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 639/1147] blk.34.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 640/1147] blk.34.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 641/1147] blk.34.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.34.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 642/1147] blk.34.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 643/1147] blk.34.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.34.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 644/1147] blk.34.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 645/1147] blk.34.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 646/1147] blk.34.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 647/1147] blk.34.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 648/1147] blk.34.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.34.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 649/1147] blk.34.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.34.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 650/1147] blk.34.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.34.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 651/1147] blk.34.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 652/1147] blk.35.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 653/1147] blk.35.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 654/1147] blk.35.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.35.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 655/1147] blk.35.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.35.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 656/1147] blk.35.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.35.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 657/1147] blk.35.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 658/1147] blk.35.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 659/1147] blk.35.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 660/1147] blk.35.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.35.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 661/1147] blk.35.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 662/1147] blk.35.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.35.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 663/1147] blk.35.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 664/1147] blk.35.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 665/1147] blk.35.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 666/1147] blk.35.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 667/1147] blk.35.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.35.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 668/1147] blk.35.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.35.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 669/1147] blk.35.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.35.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 670/1147] blk.35.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 671/1147] blk.36.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 672/1147] blk.36.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 673/1147] blk.36.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.36.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 674/1147] blk.36.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.36.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 675/1147] blk.36.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.36.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 676/1147] blk.36.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 677/1147] blk.36.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 678/1147] blk.36.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 679/1147] blk.36.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.36.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 680/1147] blk.36.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 681/1147] blk.36.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.36.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 682/1147] blk.36.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 683/1147] blk.36.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 684/1147] blk.36.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 685/1147] blk.36.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 686/1147] blk.36.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.36.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 687/1147] blk.36.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.36.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 688/1147] blk.36.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.36.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 689/1147] blk.36.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 690/1147] blk.37.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 691/1147] blk.37.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 692/1147] blk.37.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.37.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 693/1147] blk.37.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.37.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 694/1147] blk.37.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.37.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 695/1147] blk.37.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 696/1147] blk.37.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 697/1147] blk.37.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 698/1147] blk.37.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.37.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 699/1147] blk.37.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 700/1147] blk.37.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.37.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 701/1147] blk.37.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 702/1147] blk.37.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 703/1147] blk.37.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 704/1147] blk.37.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 705/1147] blk.37.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.37.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 706/1147] blk.37.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.37.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 707/1147] blk.37.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.37.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 708/1147] blk.37.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 709/1147] blk.38.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 710/1147] blk.38.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 711/1147] blk.38.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.38.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 712/1147] blk.38.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.38.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 713/1147] blk.38.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.38.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 714/1147] blk.38.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 715/1147] blk.38.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 716/1147] blk.38.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 717/1147] blk.38.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.38.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 718/1147] blk.38.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 719/1147] blk.38.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.38.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 720/1147] blk.38.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 721/1147] blk.38.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 722/1147] blk.38.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 723/1147] blk.38.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 724/1147] blk.38.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.38.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 725/1147] blk.38.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.38.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 726/1147] blk.38.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.38.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 727/1147] blk.38.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 728/1147] blk.39.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 729/1147] blk.39.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 730/1147] blk.39.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.39.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 731/1147] blk.39.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.39.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 732/1147] blk.39.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.39.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 733/1147] blk.39.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 734/1147] blk.39.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 735/1147] blk.39.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 736/1147] blk.39.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.39.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 737/1147] blk.39.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 738/1147] blk.39.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.39.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 739/1147] blk.39.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 740/1147] blk.39.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 741/1147] blk.39.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 742/1147] blk.39.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 743/1147] blk.39.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.39.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 744/1147] blk.39.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.39.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 745/1147] blk.39.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.39.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 746/1147] blk.39.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 747/1147] blk.40.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 748/1147] blk.40.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 749/1147] blk.40.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.40.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 750/1147] blk.40.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.40.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 751/1147] blk.40.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.40.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 752/1147] blk.40.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 753/1147] blk.40.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 754/1147] blk.40.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 755/1147] blk.40.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.40.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 756/1147] blk.40.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 757/1147] blk.40.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.40.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 758/1147] blk.40.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 759/1147] blk.40.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 760/1147] blk.40.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 761/1147] blk.40.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 762/1147] blk.40.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.40.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 763/1147] blk.40.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.40.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 764/1147] blk.40.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.40.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 765/1147] blk.40.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 766/1147] blk.41.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 767/1147] blk.41.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 768/1147] blk.41.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.41.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 769/1147] blk.41.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.41.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 770/1147] blk.41.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.41.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 771/1147] blk.41.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 772/1147] blk.41.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 773/1147] blk.41.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 774/1147] blk.41.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.41.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 775/1147] blk.41.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 776/1147] blk.41.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.41.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 777/1147] blk.41.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 778/1147] blk.41.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 779/1147] blk.41.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 780/1147] blk.41.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 781/1147] blk.41.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.41.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 782/1147] blk.41.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.41.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 783/1147] blk.41.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.41.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 784/1147] blk.41.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 785/1147] blk.42.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 786/1147] blk.42.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 787/1147] blk.42.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.42.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 788/1147] blk.42.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.42.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 789/1147] blk.42.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.42.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 790/1147] blk.42.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 791/1147] blk.42.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 792/1147] blk.42.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 793/1147] blk.42.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.42.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 794/1147] blk.42.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 795/1147] blk.42.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.42.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 796/1147] blk.42.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 797/1147] blk.42.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 798/1147] blk.42.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 799/1147] blk.42.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 800/1147] blk.42.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.42.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 801/1147] blk.42.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.42.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 802/1147] blk.42.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.42.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 803/1147] blk.42.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 804/1147] blk.43.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 805/1147] blk.43.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 806/1147] blk.43.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.43.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 807/1147] blk.43.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.43.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 808/1147] blk.43.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.43.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 809/1147] blk.43.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 810/1147] blk.43.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 811/1147] blk.43.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 812/1147] blk.43.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.43.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 813/1147] blk.43.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 814/1147] blk.43.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.43.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 815/1147] blk.43.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 816/1147] blk.43.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 817/1147] blk.43.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 818/1147] blk.43.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 819/1147] blk.43.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.43.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 820/1147] blk.43.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.43.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 821/1147] blk.43.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.43.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 822/1147] blk.43.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 823/1147] blk.44.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 824/1147] blk.44.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 825/1147] blk.44.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.44.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 826/1147] blk.44.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.44.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 827/1147] blk.44.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.44.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 828/1147] blk.44.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 829/1147] blk.44.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 830/1147] blk.44.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 831/1147] blk.44.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.44.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 832/1147] blk.44.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 833/1147] blk.44.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.44.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 834/1147] blk.44.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 835/1147] blk.44.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 836/1147] blk.44.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 837/1147] blk.44.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 838/1147] blk.44.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.44.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 839/1147] blk.44.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.44.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 840/1147] blk.44.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.44.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 841/1147] blk.44.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 842/1147] blk.45.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 843/1147] blk.45.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 844/1147] blk.45.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.45.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 845/1147] blk.45.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.45.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 846/1147] blk.45.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.45.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 847/1147] blk.45.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 848/1147] blk.45.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 849/1147] blk.45.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 850/1147] blk.45.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.45.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 851/1147] blk.45.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 852/1147] blk.45.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.45.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 853/1147] blk.45.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 854/1147] blk.45.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 855/1147] blk.45.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 856/1147] blk.45.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 857/1147] blk.45.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.45.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 858/1147] blk.45.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.45.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 859/1147] blk.45.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.45.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 860/1147] blk.45.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 861/1147] blk.46.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 862/1147] blk.46.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 863/1147] blk.46.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.46.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 864/1147] blk.46.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.46.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 865/1147] blk.46.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.46.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 866/1147] blk.46.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 867/1147] blk.46.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 868/1147] blk.46.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 869/1147] blk.46.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.46.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 870/1147] blk.46.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 871/1147] blk.46.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.46.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 872/1147] blk.46.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 873/1147] blk.46.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 874/1147] blk.46.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 875/1147] blk.46.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 876/1147] blk.46.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.46.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 877/1147] blk.46.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.46.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 878/1147] blk.46.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.46.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 879/1147] blk.46.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 880/1147] blk.47.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 881/1147] blk.47.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 882/1147] blk.47.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.47.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 883/1147] blk.47.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.47.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 884/1147] blk.47.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.47.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 885/1147] blk.47.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 886/1147] blk.47.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 887/1147] blk.47.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 888/1147] blk.47.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.47.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 889/1147] blk.47.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 890/1147] blk.47.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.47.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 891/1147] blk.47.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 892/1147] blk.47.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 893/1147] blk.47.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 894/1147] blk.47.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 895/1147] blk.47.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.47.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 896/1147] blk.47.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.47.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 897/1147] blk.47.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.47.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 898/1147] blk.47.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 899/1147] blk.48.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 900/1147] blk.48.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 901/1147] blk.48.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.48.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 902/1147] blk.48.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.48.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 903/1147] blk.48.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.48.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 904/1147] blk.48.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 905/1147] blk.48.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 906/1147] blk.48.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 907/1147] blk.48.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.48.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 908/1147] blk.48.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 909/1147] blk.48.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.48.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 910/1147] blk.48.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 911/1147] blk.48.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 912/1147] blk.48.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 913/1147] blk.48.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 914/1147] blk.48.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.48.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 915/1147] blk.48.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.48.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 916/1147] blk.48.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.48.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 917/1147] blk.48.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 918/1147] blk.49.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 919/1147] blk.49.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 920/1147] blk.49.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.49.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 921/1147] blk.49.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.49.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 922/1147] blk.49.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.49.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 923/1147] blk.49.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 924/1147] blk.49.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 925/1147] blk.49.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 926/1147] blk.49.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.49.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 927/1147] blk.49.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 928/1147] blk.49.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.49.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 929/1147] blk.49.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 930/1147] blk.49.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 931/1147] blk.49.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 932/1147] blk.49.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 933/1147] blk.49.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.49.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 934/1147] blk.49.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.49.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 935/1147] blk.49.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.49.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 936/1147] blk.49.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 937/1147] blk.50.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 938/1147] blk.50.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 939/1147] blk.50.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.50.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 940/1147] blk.50.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.50.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 941/1147] blk.50.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.50.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 942/1147] blk.50.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 943/1147] blk.50.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 944/1147] blk.50.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 945/1147] blk.50.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.50.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 946/1147] blk.50.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 947/1147] blk.50.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.50.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 948/1147] blk.50.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 949/1147] blk.50.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 950/1147] blk.50.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 951/1147] blk.50.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 952/1147] blk.50.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.50.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 953/1147] blk.50.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.50.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 954/1147] blk.50.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.50.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 955/1147] blk.50.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 956/1147] blk.51.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 957/1147] blk.51.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 958/1147] blk.51.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.51.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 959/1147] blk.51.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.51.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 960/1147] blk.51.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.51.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 961/1147] blk.51.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 962/1147] blk.51.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 963/1147] blk.51.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 964/1147] blk.51.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.51.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 965/1147] blk.51.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 966/1147] blk.51.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.51.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 967/1147] blk.51.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 968/1147] blk.51.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 969/1147] blk.51.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 970/1147] blk.51.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 971/1147] blk.51.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.51.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 972/1147] blk.51.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.51.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 973/1147] blk.51.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.51.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 974/1147] blk.51.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 975/1147] blk.52.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 976/1147] blk.52.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 977/1147] blk.52.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.52.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 978/1147] blk.52.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.52.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 979/1147] blk.52.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.52.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 980/1147] blk.52.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 981/1147] blk.52.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 982/1147] blk.52.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 983/1147] blk.52.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.52.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 984/1147] blk.52.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 985/1147] blk.52.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.52.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 986/1147] blk.52.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 987/1147] blk.52.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 988/1147] blk.52.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 989/1147] blk.52.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 990/1147] blk.52.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.52.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 991/1147] blk.52.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.52.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 992/1147] blk.52.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.52.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 993/1147] blk.52.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 994/1147] blk.53.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 995/1147] blk.53.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 996/1147] blk.53.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.53.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 997/1147] blk.53.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.53.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 998/1147] blk.53.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.53.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 999/1147] blk.53.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[1000/1147] blk.53.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[1001/1147] blk.53.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[1002/1147] blk.53.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.53.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[1003/1147] blk.53.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[1004/1147] blk.53.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.53.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[1005/1147] blk.53.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[1006/1147] blk.53.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[1007/1147] blk.53.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[1008/1147] blk.53.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1009/1147] blk.53.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.53.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[1010/1147] blk.53.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.53.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1011/1147] blk.53.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.53.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1012/1147] blk.53.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1013/1147] blk.54.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[1014/1147] blk.54.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[1015/1147] blk.54.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.54.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1016/1147] blk.54.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.54.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1017/1147] blk.54.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.54.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1018/1147] blk.54.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[1019/1147] blk.54.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[1020/1147] blk.54.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[1021/1147] blk.54.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.54.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[1022/1147] blk.54.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[1023/1147] blk.54.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.54.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[1024/1147] blk.54.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[1025/1147] blk.54.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[1026/1147] blk.54.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[1027/1147] blk.54.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1028/1147] blk.54.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.54.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[1029/1147] blk.54.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.54.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1030/1147] blk.54.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.54.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1031/1147] blk.54.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1032/1147] blk.55.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[1033/1147] blk.55.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[1034/1147] blk.55.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.55.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1035/1147] blk.55.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.55.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1036/1147] blk.55.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.55.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1037/1147] blk.55.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[1038/1147] blk.55.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[1039/1147] blk.55.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[1040/1147] blk.55.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.55.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[1041/1147] blk.55.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[1042/1147] blk.55.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.55.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[1043/1147] blk.55.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[1044/1147] blk.55.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[1045/1147] blk.55.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[1046/1147] blk.55.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1047/1147] blk.55.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.55.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[1048/1147] blk.55.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.55.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1049/1147] blk.55.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.55.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1050/1147] blk.55.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1051/1147] blk.56.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[1052/1147] blk.56.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[1053/1147] blk.56.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.56.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1054/1147] blk.56.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.56.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1055/1147] blk.56.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.56.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1056/1147] blk.56.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[1057/1147] blk.56.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[1058/1147] blk.56.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[1059/1147] blk.56.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.56.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[1060/1147] blk.56.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[1061/1147] blk.56.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.56.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[1062/1147] blk.56.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[1063/1147] blk.56.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[1064/1147] blk.56.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[1065/1147] blk.56.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1066/1147] blk.56.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.56.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[1067/1147] blk.56.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.56.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1068/1147] blk.56.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.56.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1069/1147] blk.56.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1070/1147] blk.57.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[1071/1147] blk.57.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[1072/1147] blk.57.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.57.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1073/1147] blk.57.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.57.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1074/1147] blk.57.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.57.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1075/1147] blk.57.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[1076/1147] blk.57.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[1077/1147] blk.57.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[1078/1147] blk.57.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.57.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[1079/1147] blk.57.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[1080/1147] blk.57.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.57.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[1081/1147] blk.57.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[1082/1147] blk.57.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[1083/1147] blk.57.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[1084/1147] blk.57.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1085/1147] blk.57.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.57.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[1086/1147] blk.57.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.57.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1087/1147] blk.57.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.57.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1088/1147] blk.57.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1089/1147] blk.58.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[1090/1147] blk.58.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[1091/1147] blk.58.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.58.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1092/1147] blk.58.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.58.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1093/1147] blk.58.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.58.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1094/1147] blk.58.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[1095/1147] blk.58.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[1096/1147] blk.58.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[1097/1147] blk.58.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.58.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[1098/1147] blk.58.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[1099/1147] blk.58.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.58.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[1100/1147] blk.58.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[1101/1147] blk.58.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[1102/1147] blk.58.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[1103/1147] blk.58.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1104/1147] blk.58.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.58.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[1105/1147] blk.58.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.58.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1106/1147] blk.58.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.58.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1107/1147] blk.58.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1108/1147] blk.59.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[1109/1147] blk.59.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[1110/1147] blk.59.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.59.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1111/1147] blk.59.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.59.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1112/1147] blk.59.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.59.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1113/1147] blk.59.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[1114/1147] blk.59.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[1115/1147] blk.59.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[1116/1147] blk.59.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.59.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[1117/1147] blk.59.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[1118/1147] blk.59.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.59.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[1119/1147] blk.59.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[1120/1147] blk.59.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[1121/1147] blk.59.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[1122/1147] blk.59.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1123/1147] blk.59.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.59.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[1124/1147] blk.59.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.59.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1125/1147] blk.59.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.59.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1126/1147] blk.59.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1127/1147] blk.60.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[1128/1147] blk.60.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[1129/1147] blk.60.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.60.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1130/1147] blk.60.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.60.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1131/1147] blk.60.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.60.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1132/1147] blk.60.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[1133/1147] blk.60.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[1134/1147] blk.60.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[1135/1147] blk.60.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.60.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[1136/1147] blk.60.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[1137/1147] blk.60.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.60.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[1138/1147] blk.60.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[1139/1147] blk.60.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[1140/1147] blk.60.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[1141/1147] output.weight - [ 7168, 129280, 1, 1], type = f16, Using custom type q6_K for tensor output.weight
====== llama_model_quantize_internal: did not find weights for output.weight
converting to q6_K .. size = 1767.50 MiB -> 724.95 MiB
[1142/1147] blk.60.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1143/1147] blk.60.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.60.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[1144/1147] blk.60.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.60.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1145/1147] blk.60.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.60.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1146/1147] blk.60.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1147/1147] output_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
llama_model_quantize_internal: model size = 1282038.27 MB
llama_model_quantize_internal: quant size = 318818.01 MB
llama_model_quantize_internal: WARNING: 61 of 785 tensor(s) required fallback quantization
```
</details>
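
The repeated `attn_k_b` fallbacks in the log above are expected rather than a bug: q6_K, like all k-quants, packs each row into 256-element super-blocks, while `blk.*.attn_k_b.weight` rows are only 128 elements wide (shape `[128, 65536]`), so the quantizer drops to q8_0, which uses 32-element blocks. That accounts for the 61 fallbacks in the summary, one per layer. A quick shell check of the divisibility rule (256 is the standard ggml `QK_K` super-block size):

```
# a non-zero remainder means the row cannot be cut into 256-wide
# super-blocks, which is what triggers the q8_0 fallback above
echo $((128 % 256))    # -> 128: q6_K impossible for attn_k_b rows
echo $((7168 % 256))   # -> 0:   q6_K fine for 7168-wide rows
```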
---
👤 **davidsyoung** commented the **2025-03-18** at **10:41:34**:<br>
<details>
<summary>PPL run</summary>
```
./build/bin/llama-perplexity -m /models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf -f /models/wiki.test.raw -fmoe -mla 2 -fa -ts 24/24/24/24/24/24/24/24/24/24/24/24/24/24/24/24 -c 512 -ub 512 --n-gpu-layers 999 -ot "blk\.3\.ffn_(down|gate|up)_exps\.weight|blk\.4\.ffn_(down|gate|up)_exps\.weight|blk\.5\.ffn_(down|gate|up)_exps\.weight=CUDA0" -ot "blk\.6\.ffn_(down|gate|up)_exps\.weight|blk\.7\.ffn_(down|gate|up)_exps\.weight|blk\.8\.ffn_(down|gate|up)_exps\.weight=CUDA1" -ot "blk\.9\.ffn_(down|gate|up)_exps\.weight|blk\.10\.ffn_(down|gate|up)_exps\.weight|blk\.11\.ffn_(down|gate|up)_exps\.weight|blk\.12\.ffn_(down|gate|up)_exps\.weight=CUDA2" -ot "blk\.13\.ffn_(down|gate|up)_exps\.weight|blk\.14\.ffn_(down|gate|up)_exps\.weight|blk\.15\.ffn_(down|gate|up)_exps\.weight|blk\.16\.ffn_(down|gate|up)_exps\.weight=CUDA3" -ot "blk\.17\.ffn_(down|gate|up)_exps\.weight|blk\.18\.ffn_(down|gate|up)_exps\.weight|blk\.19\.ffn_(down|gate|up)_exps\.weight|blk\.20\.ffn_(down|gate|up)_exps\.weight=CUDA4" -ot "blk\.21\.ffn_(down|gate|up)_exps\.weight|blk\.22\.ffn_(down|gate|up)_exps\.weight|blk\.23\.ffn_(down|gate|up)_exps\.weight|blk\.24\.ffn_(down|gate|up)_exps\.weight=CUDA5" -ot "blk\.25\.ffn_(down|gate|up)_exps\.weight|blk\.26\.ffn_(down|gate|up)_exps\.weight|blk\.27\.ffn_(down|gate|up)_exps\.weight|blk\.28\.ffn_(down|gate|up)_exps\.weight=CUDA6" -ot "blk\.29\.ffn_(down|gate|up)_exps\.weight|blk\.30\.ffn_(down|gate|up)_exps\.weight|blk\.31\.ffn_(down|gate|up)_exps\.weight|blk\.32\.ffn_(down|gate|up)_exps\.weight=CUDA7" -ot "blk\.33\.ffn_(down|gate|up)_exps\.weight|blk\.34\.ffn_(down|gate|up)_exps\.weight|blk\.35\.ffn_(down|gate|up)_exps\.weight|blk\.36\.ffn_(down|gate|up)_exps\.weight=CUDA8" -ot "blk\.37\.ffn_(down|gate|up)_exps\.weight|blk\.38\.ffn_(down|gate|up)_exps\.weight|blk\.39\.ffn_(down|gate|up)_exps\.weight|blk\.40\.ffn_(down|gate|up)_exps\.weight=CUDA9" -ot "blk\.41\.ffn_(down|gate|up)_exps\.weight|blk\.42\.ffn_(down|gate|up)_exps\.weight|blk\.43\.ffn_(down|gate|up)_exps\.weight|blk\.44\.ffn_(down|gate|up)_exps\.weight=CUDA10" -ot "blk\.45\.ffn_(down|gate|up)_exps\.weight|blk\.46\.ffn_(down|gate|up)_exps\.weight|blk\.47\.ffn_(down|gate|up)_exps\.weight|blk\.48\.ffn_(down|gate|up)_exps\.weight=CUDA11" -ot "blk\.49\.ffn_(down|gate|up)_exps\.weight|blk\.50\.ffn_(down|gate|up)_exps\.weight|blk\.51\.ffn_(down|gate|up)_exps\.weight|blk\.52\.ffn_(down|gate|up)_exps\.weight=CUDA12" -ot "blk\.53\.ffn_(down|gate|up)_exps\.weight|blk\.54\.ffn_(down|gate|up)_exps\.weight|blk\.55\.ffn_(down|gate|up)_exps\.weight|blk\.56\.ffn_(down|gate|up)_exps\.weight=CUDA13" -ot "blk\.57\.ffn_(down|gate|up)_exps\.weight|blk\.58\.ffn_(down|gate|up)_exps\.weight|blk\.59\.ffn_(down|gate|up)_exps\.weight|blk\.60\.ffn_(down|gate|up)_exps\.weight=CUDA14" --seed 1741529602 --temp 0.5
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 16 CUDA devices:
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 2: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 3: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 4: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 5: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 6: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 7: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 8: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 9: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 10: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 11: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 12: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 13: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 14: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 15: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
main: build = 0 (unknown)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: seed = 1741529602
llama_model_loader: loaded meta data with 54 key-value pairs and 1147 tensors from /models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = deepseek2
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = unsloth_DeepSeek R1 BF16
llama_model_loader: - kv 3: general.size_label str = 256x21B
llama_model_loader: - kv 4: general.license str = mit
llama_model_loader: - kv 5: general.base_model.count u32 = 1
llama_model_loader: - kv 6: general.base_model.0.name str = DeepSeek R1
llama_model_loader: - kv 7: general.base_model.0.organization str = Deepseek Ai
llama_model_loader: - kv 8: general.base_model.0.repo_url str = https://huggingface.co/deepseek-ai/De...
llama_model_loader: - kv 9: general.tags arr[str,3] = ["deepseek", "unsloth", "transformers"]
llama_model_loader: - kv 10: general.languages arr[str,1] = ["en"]
llama_model_loader: - kv 11: deepseek2.block_count u32 = 61
llama_model_loader: - kv 12: deepseek2.context_length u32 = 163840
llama_model_loader: - kv 13: deepseek2.embedding_length u32 = 7168
llama_model_loader: - kv 14: deepseek2.feed_forward_length u32 = 18432
llama_model_loader: - kv 15: deepseek2.attention.head_count u32 = 128
llama_model_loader: - kv 16: deepseek2.attention.head_count_kv u32 = 128
llama_model_loader: - kv 17: deepseek2.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 18: deepseek2.attention.layer_norm_rms_epsilon f32 = 0.000001
llama_model_loader: - kv 19: deepseek2.expert_used_count u32 = 8
llama_model_loader: - kv 20: general.file_type u32 = 18
llama_model_loader: - kv 21: deepseek2.leading_dense_block_count u32 = 3
llama_model_loader: - kv 22: deepseek2.vocab_size u32 = 129280
llama_model_loader: - kv 23: deepseek2.attention.q_lora_rank u32 = 1536
llama_model_loader: - kv 24: deepseek2.attention.kv_lora_rank u32 = 512
llama_model_loader: - kv 25: deepseek2.attention.key_length u32 = 192
llama_model_loader: - kv 26: deepseek2.attention.value_length u32 = 128
llama_model_loader: - kv 27: deepseek2.expert_feed_forward_length u32 = 2048
llama_model_loader: - kv 28: deepseek2.expert_count u32 = 256
llama_model_loader: - kv 29: deepseek2.expert_shared_count u32 = 1
llama_model_loader: - kv 30: deepseek2.expert_weights_scale f32 = 2.500000
llama_model_loader: - kv 31: deepseek2.expert_weights_norm bool = true
llama_model_loader: - kv 32: deepseek2.expert_gating_func u32 = 2
llama_model_loader: - kv 33: deepseek2.rope.dimension_count u32 = 64
llama_model_loader: - kv 34: deepseek2.rope.scaling.type str = yarn
llama_model_loader: - kv 35: deepseek2.rope.scaling.factor f32 = 40.000000
llama_model_loader: - kv 36: deepseek2.rope.scaling.original_context_length u32 = 4096
llama_model_loader: - kv 37: deepseek2.rope.scaling.yarn_log_multiplier f32 = 0.100000
llama_model_loader: - kv 38: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 39: tokenizer.ggml.pre str = deepseek-v3
llama_model_loader: - kv 40: tokenizer.ggml.tokens arr[str,129280] = ["<begin▁of▁sentence>", "<...
llama_model_loader: - kv 41: tokenizer.ggml.token_type arr[i32,129280] = [3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 42: tokenizer.ggml.merges arr[str,127741] = ["Ġ t", "Ġ a", "i n", "Ġ Ġ", "h e...
llama_model_loader: - kv 43: tokenizer.ggml.bos_token_id u32 = 0
llama_model_loader: - kv 44: tokenizer.ggml.eos_token_id u32 = 1
llama_model_loader: - kv 45: tokenizer.ggml.padding_token_id u32 = 128815
llama_model_loader: - kv 46: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 47: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 48: tokenizer.chat_template str = {% if not add_generation_prompt is de...
llama_model_loader: - kv 49: general.quantization_version u32 = 2
llama_model_loader: - kv 50: quantize.imatrix.file str = /models/deepseek-config/imatrix.dat
llama_model_loader: - kv 51: quantize.imatrix.dataset str = imatrix-training-full-3
llama_model_loader: - kv 52: quantize.imatrix.entries_count i32 = 720
llama_model_loader: - kv 53: quantize.imatrix.chunks_count i32 = 315
llama_model_loader: - type f32: 361 tensors
llama_model_loader: - type q8_0: 62 tensors
llama_model_loader: - type q5_K: 6 tensors
llama_model_loader: - type q6_K: 550 tensors
llama_model_loader: - type iq3_k: 104 tensors
llama_model_loader: - type iq4_k: 64 tensors
llm_load_vocab: special tokens cache size = 819
llm_load_vocab: token to piece cache size = 0.8223 MB
llm_load_print_meta: format = GGUF V3 (latest)
llm_load_print_meta: arch = deepseek2
llm_load_print_meta: vocab type = BPE
llm_load_print_meta: n_vocab = 129280
llm_load_print_meta: n_merges = 127741
llm_load_print_meta: vocab_only = 0
llm_load_print_meta: n_ctx_train = 163840
llm_load_print_meta: n_embd = 7168
llm_load_print_meta: n_layer = 61
llm_load_print_meta: n_head = 128
llm_load_print_meta: n_head_kv = 128
llm_load_print_meta: n_rot = 64
llm_load_print_meta: n_swa = 0
llm_load_print_meta: n_embd_head_k = 192
llm_load_print_meta: n_embd_head_v = 128
llm_load_print_meta: n_gqa = 1
llm_load_print_meta: n_embd_k_gqa = 24576
llm_load_print_meta: n_embd_v_gqa = 16384
llm_load_print_meta: f_norm_eps = 0.0e+00
llm_load_print_meta: f_norm_rms_eps = 1.0e-06
llm_load_print_meta: f_clamp_kqv = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: f_logit_scale = 0.0e+00
llm_load_print_meta: n_ff = 18432
llm_load_print_meta: n_expert = 256
llm_load_print_meta: n_expert_used = 8
llm_load_print_meta: causal attn = 1
llm_load_print_meta: pooling type = 0
llm_load_print_meta: rope type = 0
llm_load_print_meta: rope scaling = yarn
llm_load_print_meta: freq_base_train = 10000.0
llm_load_print_meta: freq_scale_train = 0.025
llm_load_print_meta: n_ctx_orig_yarn = 4096
llm_load_print_meta: rope_finetuned = unknown
llm_load_print_meta: ssm_d_conv = 0
llm_load_print_meta: ssm_d_inner = 0
llm_load_print_meta: ssm_d_state = 0
llm_load_print_meta: ssm_dt_rank = 0
llm_load_print_meta: model type = 671B
llm_load_print_meta: model ftype = Q6_K
llm_load_print_meta: model params = 672.050 B
llm_load_print_meta: model size = 311.346 GiB (3.980 BPW)
llm_load_print_meta: repeating layers = 309.721 GiB (3.970 BPW, 670.196 B parameters)
llm_load_print_meta: general.name = unsloth_DeepSeek R1 BF16
llm_load_print_meta: BOS token = 0 '<begin▁of▁sentence>'
llm_load_print_meta: EOS token = 1 '<end▁of▁sentence>'
llm_load_print_meta: PAD token = 128815 '<PAD▁TOKEN>'
llm_load_print_meta: LF token = 131 'Ä'
llm_load_print_meta: max token length = 256
llm_load_print_meta: n_layer_dense_lead = 3
llm_load_print_meta: n_lora_q = 1536
llm_load_print_meta: n_lora_kv = 512
llm_load_print_meta: n_ff_exp = 2048
llm_load_print_meta: n_expert_shared = 1
llm_load_print_meta: expert_weights_scale = 2.5
llm_load_print_meta: expert_weights_norm = 1
llm_load_print_meta: expert_gating_func = sigmoid
llm_load_print_meta: rope_yarn_log_mul = 0.1000
llm_load_tensors: ggml ctx size = 7.94 MiB
Tensor blk.3.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.3.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.3.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.6.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.6.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.6.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.9.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.9.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.9.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.13.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.13.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.13.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.17.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.17.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.17.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.21.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.21.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.21.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.25.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.25.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.25.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.29.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.29.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.29.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.33.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.33.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.33.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.37.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.37.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.37.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.41.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.41.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.41.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.45.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.45.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.45.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.49.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.49.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.49.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.53.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.53.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.53.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.57.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.57.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.57.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_up_exps.weight buffer type overriden to CUDA14
llm_load_tensors: offloading 61 repeating layers to GPU
llm_load_tensors: offloading non-repeating layers to GPU
llm_load_tensors: offloaded 62/62 layers to GPU
llm_load_tensors: CPU buffer size = 938.98 MiB
llm_load_tensors: CUDA0 buffer size = 21105.69 MiB
llm_load_tensors: CUDA1 buffer size = 20299.82 MiB
llm_load_tensors: CUDA2 buffer size = 21195.82 MiB
llm_load_tensors: CUDA3 buffer size = 21195.82 MiB
llm_load_tensors: CUDA4 buffer size = 21195.82 MiB
llm_load_tensors: CUDA5 buffer size = 21195.82 MiB
llm_load_tensors: CUDA6 buffer size = 21195.82 MiB
llm_load_tensors: CUDA7 buffer size = 20992.86 MiB
llm_load_tensors: CUDA8 buffer size = 21195.82 MiB
llm_load_tensors: CUDA9 buffer size = 21195.82 MiB
llm_load_tensors: CUDA10 buffer size = 21195.82 MiB
llm_load_tensors: CUDA11 buffer size = 21195.82 MiB
llm_load_tensors: CUDA12 buffer size = 21195.82 MiB
llm_load_tensors: CUDA13 buffer size = 21195.82 MiB
llm_load_tensors: CUDA14 buffer size = 21195.82 MiB
llm_load_tensors: CUDA15 buffer size = 1130.89 MiB
....................................................................................................
llama_new_context_with_model: n_ctx = 2048
llama_new_context_with_model: n_batch = 2048
llama_new_context_with_model: n_ubatch = 512
llama_new_context_with_model: flash_attn = 1
llama_new_context_with_model: mla_attn = 2
llama_new_context_with_model: attn_max_b = 0
llama_new_context_with_model: fused_moe = 1
llama_new_context_with_model: ser = -1, 0
llama_new_context_with_model: freq_base = 10000.0
llama_new_context_with_model: freq_scale = 0.025
llama_kv_cache_init: layer 0: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 1: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 2: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 3: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 4: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 5: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 6: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 7: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 8: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 9: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 10: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 11: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 12: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 13: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 14: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 15: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 16: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 17: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 18: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 19: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 20: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 21: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 22: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 23: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 24: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 25: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 26: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 27: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 28: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 29: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 30: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 31: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 32: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 33: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 34: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 35: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 36: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 37: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 38: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 39: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 40: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 41: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 42: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 43: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 44: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 45: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 46: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 47: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 48: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 49: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 50: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 51: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 52: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 53: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 54: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 55: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 56: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 57: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 58: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 59: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 60: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: CUDA0 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA1 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA2 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA3 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA4 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA5 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA6 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA7 KV buffer size = 6.75 MiB
llama_kv_cache_init: CUDA8 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA9 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA10 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA11 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA12 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA13 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA14 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA15 KV buffer size = 4.50 MiB
llama_new_context_with_model: KV self size = 137.25 MiB, c^KV (f16): 137.25 MiB, kv^T: not used
llama_new_context_with_model: CUDA_Host output buffer size = 1.97 MiB
llama_new_context_with_model: CUDA0 compute buffer size = 455.00 MiB
llama_new_context_with_model: CUDA1 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA2 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA3 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA4 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA5 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA6 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA7 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA8 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA9 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA10 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA11 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA12 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA13 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA14 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA15 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA_Host compute buffer size = 18.01 MiB
llama_new_context_with_model: graph nodes = 3487
llama_new_context_with_model: graph splits = 65
system_info: n_threads = 64 / 128 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 |
perplexity: tokenizing the input ..
perplexity: tokenization took 1167.48 ms
perplexity: calculating perplexity over 561 chunks, n_ctx=512, batch_size=2048, n_seq=4
perplexity: 22.93 seconds per pass - ETA 53.58 minutes
[1]2.5633,[2]3.3137,[3]nan,[4]nan,[5]nan,[6]nan,[7]nan,[8]nan,[9]nan,[10]nan,[11]nan,[12]nan,[13]nan,[14]nan,[15]nan,[16]nan,[17]nan,[18]nan,[19]nan,[20]nan,[21]nan,[22]nan,[23]nan,[24]nan,[25]nan,[26]nan,[27]nan,[28]nan,[29]nan,[30]nan,[31]nan,[32]nan,[33]nan,[34]nan,[35]nan,[36]nan,[37]nan,[38]nan,[39]nan,[40]nan,[41]nan,[42]nan,[43]nan,[44]nan,[45]nan,[46]nan,[47]nan,[48]nan,[49]nan,[50]nan,[51]nan,[52]nan,[53]nan,[54]nan,[55]nan,[56]nan,[57]nan,[58]nan,[59]nan,[60]nan,[61]nan,[62]nan,[63]nan,[64]nan,[65]nan,[66]nan,[67]nan,[68]nan,[69]nan,[70]nan,[71]nan,[72]nan,[73]nan,[74]nan,[75]nan,[76]nan,[77]nan,[78]nan,[79]nan,[80]nan,[81]nan,[82]nan,[83]nan,[84]nan,[85]nan,[86]nan,[87]nan,[88]nan,[89]nan,[90]nan,[91]nan,[92]nan,^C
```
</details>
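
Aside from the model path, the long invocation above is just repeated `-ot "regex=backend"` overrides pinning each layer's routed-expert tensors to a specific GPU, as the "buffer type overriden to CUDAn" lines confirm. A shortened sketch of the same pattern, with illustrative paths and only the first two devices:

```
./build/bin/llama-perplexity -m model.gguf -f wiki.test.raw \
  -fa -mla 2 -fmoe -c 512 -ub 512 --n-gpu-layers 999 \
  -ot "blk\.(3|4|5)\.ffn_(down|gate|up)_exps\.weight=CUDA0" \
  -ot "blk\.(6|7|8)\.ffn_(down|gate|up)_exps\.weight=CUDA1"
```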
---
👤 **ikawrakow** commented the **2025-03-18** at **10:45:08**:<br>
Did you enable the `GGML_CUDA_IQK_FORCE_BF16` option when building?
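
A minimal rebuild sketch with that option enabled, assuming a CMake CUDA build (adjust the remaining flags to your setup):

```
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_IQK_FORCE_BF16=ON
cmake --build build --config Release -j
```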
---
👤 **davidsyoung** commented the **2025-03-18** at **10:49:14**:<br>
D'oh. Back to the drawing board. Apologies! Will report back.
---
👤 **davidsyoung** commented the **2025-03-18** at **13:11:13**:<br>
Works! Great work!
<details>
<summary>Successful PPL run</summary>
```
root@f9b3ae98b5a1:/app# ./build/bin/llama-perplexity -m /models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf -f /models/wiki.test.raw -fmoe -mla 2 -fa -ts 24/24/24/24/24/24/24/24/24/24/24/24/24/24/24/24 -c 512 -ub 512 --n-gpu-layers 999 -ot "blk\.3\.ffn_(down|gate|up)_exps\.weight|blk\.4\.ffn_(down|gate|up)_exps\.weight|blk\.5\.ffn_(down|gate|up)_exps\.weight=CUDA0" -ot "blk\.6\.ffn_(down|gate|up)_exps\.weight|blk\.7\.ffn_(down|gate|up)_exps\.weight|blk\.8\.ffn_(down|gate|up)_exps\.weight=CUDA1" -ot "blk\.9\.ffn_(down|gate|up)_exps\.weight|blk\.10\.ffn_(down|gate|up)_exps\.weight|blk\.11\.ffn_(down|gate|up)_exps\.weight|blk\.12\.ffn_(down|gate|up)_exps\.weight=CUDA2" -ot "blk\.13\.ffn_(down|gate|up)_exps\.weight|blk\.14\.ffn_(down|gate|up)_exps\.weight|blk\.15\.ffn_(down|gate|up)_exps\.weight|blk\.16\.ffn_(down|gate|up)_exps\.weight=CUDA3" -ot "blk\.17\.ffn_(down|gate|up)_exps\.weight|blk\.18\.ffn_(down|gate|up)_exps\.weight|blk\.19\.ffn_(down|gate|up)_exps\.weight|blk\.20\.ffn_(down|gate|up)_exps\.weight=CUDA4" -ot "blk\.21\.ffn_(down|gate|up)_exps\.weight|blk\.22\.ffn_(down|gate|up)_exps\.weight|blk\.23\.ffn_(down|gate|up)_exps\.weight|blk\.24\.ffn_(down|gate|up)_exps\.weight=CUDA5" -ot "blk\.25\.ffn_(down|gate|up)_exps\.weight|blk\.26\.ffn_(down|gate|up)_exps\.weight|blk\.27\.ffn_(down|gate|up)_exps\.weight|blk\.28\.ffn_(down|gate|up)_exps\.weight=CUDA6" -ot "blk\.29\.ffn_(down|gate|up)_exps\.weight|blk\.30\.ffn_(down|gate|up)_exps\.weight|blk\.31\.ffn_(down|gate|up)_exps\.weight|blk\.32\.ffn_(down|gate|up)_exps\.weight=CUDA7" -ot "blk\.33\.ffn_(down|gate|up)_exps\.weight|blk\.34\.ffn_(down|gate|up)_exps\.weight|blk\.35\.ffn_(down|gate|up)_exps\.weight|blk\.36\.ffn_(down|gate|up)_exps\.weight=CUDA8" -ot "blk\.37\.ffn_(down|gate|up)_exps\.weight|blk\.38\.ffn_(down|gate|up)_exps\.weight|blk\.39\.ffn_(down|gate|up)_exps\.weight|blk\.40\.ffn_(down|gate|up)_exps\.weight=CUDA9" -ot "blk\.41\.ffn_(down|gate|up)_exps\.weight|blk\.42\.ffn_(down|gate|up)_exps\.weight|blk\.43\.ffn_(down|gate|up)_exps\.weight|blk\.44\.ffn_(down|gate|up)_exps\.weight=CUDA10" -ot "blk\.45\.ffn_(down|gate|up)_exps\.weight|blk\.46\.ffn_(down|gate|up)_exps\.weight|blk\.47\.ffn_(down|gate|up)_exps\.weight|blk\.48\.ffn_(down|gate|up)_exps\.weight=CUDA11" -ot "blk\.49\.ffn_(down|gate|up)_exps\.weight|blk\.50\.ffn_(down|gate|up)_exps\.weight|blk\.51\.ffn_(down|gate|up)_exps\.weight|blk\.52\.ffn_(down|gate|up)_exps\.weight=CUDA12" -ot "blk\.53\.ffn_(down|gate|up)_exps\.weight|blk\.54\.ffn_(down|gate|up)_exps\.weight|blk\.55\.ffn_(down|gate|up)_exps\.weight|blk\.56\.ffn_(down|gate|up)_exps\.weight=CUDA13" -ot "blk\.57\.ffn_(down|gate|up)_exps\.weight|blk\.58\.ffn_(down|gate|up)_exps\.weight|blk\.59\.ffn_(down|gate|up)_exps\.weight|blk\.60\.ffn_(down|gate|up)_exps\.weight=CUDA14" --seed 1741529602 --temp 0.5
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 16 CUDA devices:
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 2: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 3: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 4: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 5: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 6: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 7: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 8: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 9: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 10: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 11: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 12: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 13: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 14: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 15: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
main: build = 0 (unknown)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: seed = 1741529602
llama_model_loader: loaded meta data with 54 key-value pairs and 1147 tensors from /models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = deepseek2
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = unsloth_DeepSeek R1 BF16
llama_model_loader: - kv 3: general.size_label str = 256x21B
llama_model_loader: - kv 4: general.license str = mit
llama_model_loader: - kv 5: general.base_model.count u32 = 1
llama_model_loader: - kv 6: general.base_model.0.name str = DeepSeek R1
llama_model_loader: - kv 7: general.base_model.0.organization str = Deepseek Ai
llama_model_loader: - kv 8: general.base_model.0.repo_url str = https://huggingface.co/deepseek-ai/De...
llama_model_loader: - kv 9: general.tags arr[str,3] = ["deepseek", "unsloth", "transformers"]
llama_model_loader: - kv 10: general.languages arr[str,1] = ["en"]
llama_model_loader: - kv 11: deepseek2.block_count u32 = 61
llama_model_loader: - kv 12: deepseek2.context_length u32 = 163840
llama_model_loader: - kv 13: deepseek2.embedding_length u32 = 7168
llama_model_loader: - kv 14: deepseek2.feed_forward_length u32 = 18432
llama_model_loader: - kv 15: deepseek2.attention.head_count u32 = 128
llama_model_loader: - kv 16: deepseek2.attention.head_count_kv u32 = 128
llama_model_loader: - kv 17: deepseek2.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 18: deepseek2.attention.layer_norm_rms_epsilon f32 = 0.000001
llama_model_loader: - kv 19: deepseek2.expert_used_count u32 = 8
llama_model_loader: - kv 20: general.file_type u32 = 18
llama_model_loader: - kv 21: deepseek2.leading_dense_block_count u32 = 3
llama_model_loader: - kv 22: deepseek2.vocab_size u32 = 129280
llama_model_loader: - kv 23: deepseek2.attention.q_lora_rank u32 = 1536
llama_model_loader: - kv 24: deepseek2.attention.kv_lora_rank u32 = 512
llama_model_loader: - kv 25: deepseek2.attention.key_length u32 = 192
llama_model_loader: - kv 26: deepseek2.attention.value_length u32 = 128
llama_model_loader: - kv 27: deepseek2.expert_feed_forward_length u32 = 2048
llama_model_loader: - kv 28: deepseek2.expert_count u32 = 256
llama_model_loader: - kv 29: deepseek2.expert_shared_count u32 = 1
llama_model_loader: - kv 30: deepseek2.expert_weights_scale f32 = 2.500000
llama_model_loader: - kv 31: deepseek2.expert_weights_norm bool = true
llama_model_loader: - kv 32: deepseek2.expert_gating_func u32 = 2
llama_model_loader: - kv 33: deepseek2.rope.dimension_count u32 = 64
llama_model_loader: - kv 34: deepseek2.rope.scaling.type str = yarn
llama_model_loader: - kv 35: deepseek2.rope.scaling.factor f32 = 40.000000
llama_model_loader: - kv 36: deepseek2.rope.scaling.original_context_length u32 = 4096
llama_model_loader: - kv 37: deepseek2.rope.scaling.yarn_log_multiplier f32 = 0.100000
llama_model_loader: - kv 38: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 39: tokenizer.ggml.pre str = deepseek-v3
llama_model_loader: - kv 40: tokenizer.ggml.tokens arr[str,129280] = ["<begin▁of▁sentence>", "<...
llama_model_loader: - kv 41: tokenizer.ggml.token_type arr[i32,129280] = [3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 42: tokenizer.ggml.merges arr[str,127741] = ["Ġ t", "Ġ a", "i n", "Ġ Ġ", "h e...
llama_model_loader: - kv 43: tokenizer.ggml.bos_token_id u32 = 0
llama_model_loader: - kv 44: tokenizer.ggml.eos_token_id u32 = 1
llama_model_loader: - kv 45: tokenizer.ggml.padding_token_id u32 = 128815
llama_model_loader: - kv 46: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 47: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 48: tokenizer.chat_template str = {% if not add_generation_prompt is de...
llama_model_loader: - kv 49: general.quantization_version u32 = 2
llama_model_loader: - kv 50: quantize.imatrix.file str = /models/deepseek-config/imatrix.dat
llama_model_loader: - kv 51: quantize.imatrix.dataset str = imatrix-training-full-3
llama_model_loader: - kv 52: quantize.imatrix.entries_count i32 = 720
llama_model_loader: - kv 53: quantize.imatrix.chunks_count i32 = 315
llama_model_loader: - type f32: 361 tensors
llama_model_loader: - type q8_0: 62 tensors
llama_model_loader: - type q5_K: 6 tensors
llama_model_loader: - type q6_K: 550 tensors
llama_model_loader: - type iq3_k: 104 tensors
llama_model_loader: - type iq4_k: 64 tensors
llm_load_vocab: special tokens cache size = 819
llm_load_vocab: token to piece cache size = 0.8223 MB
llm_load_print_meta: format = GGUF V3 (latest)
llm_load_print_meta: arch = deepseek2
llm_load_print_meta: vocab type = BPE
llm_load_print_meta: n_vocab = 129280
llm_load_print_meta: n_merges = 127741
llm_load_print_meta: vocab_only = 0
llm_load_print_meta: n_ctx_train = 163840
llm_load_print_meta: n_embd = 7168
llm_load_print_meta: n_layer = 61
llm_load_print_meta: n_head = 128
llm_load_print_meta: n_head_kv = 128
llm_load_print_meta: n_rot = 64
llm_load_print_meta: n_swa = 0
llm_load_print_meta: n_embd_head_k = 192
llm_load_print_meta: n_embd_head_v = 128
llm_load_print_meta: n_gqa = 1
llm_load_print_meta: n_embd_k_gqa = 24576
llm_load_print_meta: n_embd_v_gqa = 16384
llm_load_print_meta: f_norm_eps = 0.0e+00
llm_load_print_meta: f_norm_rms_eps = 1.0e-06
llm_load_print_meta: f_clamp_kqv = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: f_logit_scale = 0.0e+00
llm_load_print_meta: n_ff = 18432
llm_load_print_meta: n_expert = 256
llm_load_print_meta: n_expert_used = 8
llm_load_print_meta: causal attn = 1
llm_load_print_meta: pooling type = 0
llm_load_print_meta: rope type = 0
llm_load_print_meta: rope scaling = yarn
llm_load_print_meta: freq_base_train = 10000.0
llm_load_print_meta: freq_scale_train = 0.025
llm_load_print_meta: n_ctx_orig_yarn = 4096
llm_load_print_meta: rope_finetuned = unknown
llm_load_print_meta: ssm_d_conv = 0
llm_load_print_meta: ssm_d_inner = 0
llm_load_print_meta: ssm_d_state = 0
llm_load_print_meta: ssm_dt_rank = 0
llm_load_print_meta: model type = 671B
llm_load_print_meta: model ftype = Q6_K
llm_load_print_meta: model params = 672.050 B
llm_load_print_meta: model size = 311.346 GiB (3.980 BPW)
llm_load_print_meta: repeating layers = 309.721 GiB (3.970 BPW, 670.196 B parameters)
llm_load_print_meta: general.name = unsloth_DeepSeek R1 BF16
llm_load_print_meta: BOS token = 0 '<begin▁of▁sentence>'
llm_load_print_meta: EOS token = 1 '<end▁of▁sentence>'
llm_load_print_meta: PAD token = 128815 '<PAD▁TOKEN>'
llm_load_print_meta: LF token = 131 'Ä'
llm_load_print_meta: max token length = 256
llm_load_print_meta: n_layer_dense_lead = 3
llm_load_print_meta: n_lora_q = 1536
llm_load_print_meta: n_lora_kv = 512
llm_load_print_meta: n_ff_exp = 2048
llm_load_print_meta: n_expert_shared = 1
llm_load_print_meta: expert_weights_scale = 2.5
llm_load_print_meta: expert_weights_norm = 1
llm_load_print_meta: expert_gating_func = sigmoid
llm_load_print_meta: rope_yarn_log_mul = 0.1000
llm_load_tensors: ggml ctx size = 7.94 MiB
Tensor blk.3.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.3.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.3.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.6.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.6.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.6.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.9.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.9.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.9.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.13.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.13.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.13.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.17.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.17.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.17.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.21.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.21.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.21.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.25.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.25.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.25.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.29.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.29.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.29.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.33.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.33.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.33.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.37.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.37.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.37.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.41.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.41.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.41.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.45.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.45.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.45.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.49.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.49.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.49.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.53.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.53.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.53.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.57.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.57.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.57.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_up_exps.weight buffer type overriden to CUDA14
llm_load_tensors: offloading 61 repeating layers to GPU
llm_load_tensors: offloading non-repeating layers to GPU
llm_load_tensors: offloaded 62/62 layers to GPU
llm_load_tensors: CPU buffer size = 938.98 MiB
llm_load_tensors: CUDA0 buffer size = 21105.69 MiB
llm_load_tensors: CUDA1 buffer size = 20299.82 MiB
llm_load_tensors: CUDA2 buffer size = 21195.82 MiB
llm_load_tensors: CUDA3 buffer size = 21195.82 MiB
llm_load_tensors: CUDA4 buffer size = 21195.82 MiB
llm_load_tensors: CUDA5 buffer size = 21195.82 MiB
llm_load_tensors: CUDA6 buffer size = 21195.82 MiB
llm_load_tensors: CUDA7 buffer size = 20992.86 MiB
llm_load_tensors: CUDA8 buffer size = 21195.82 MiB
llm_load_tensors: CUDA9 buffer size = 21195.82 MiB
llm_load_tensors: CUDA10 buffer size = 21195.82 MiB
llm_load_tensors: CUDA11 buffer size = 21195.82 MiB
llm_load_tensors: CUDA12 buffer size = 21195.82 MiB
llm_load_tensors: CUDA13 buffer size = 21195.82 MiB
llm_load_tensors: CUDA14 buffer size = 21195.82 MiB
llm_load_tensors: CUDA15 buffer size = 1130.89 MiB
....................................................................................................
llama_new_context_with_model: n_ctx = 2048
llama_new_context_with_model: n_batch = 2048
llama_new_context_with_model: n_ubatch = 512
llama_new_context_with_model: flash_attn = 1
llama_new_context_with_model: mla_attn = 2
llama_new_context_with_model: attn_max_b = 0
llama_new_context_with_model: fused_moe = 1
llama_new_context_with_model: ser = -1, 0
llama_new_context_with_model: freq_base = 10000.0
llama_new_context_with_model: freq_scale = 0.025
llama_kv_cache_init: layer 0: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 1: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 2: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 3: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 4: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 5: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 6: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 7: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 8: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 9: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 10: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 11: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 12: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 13: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 14: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 15: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 16: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 17: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 18: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 19: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 20: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 21: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 22: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 23: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 24: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 25: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 26: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 27: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 28: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 29: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 30: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 31: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 32: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 33: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 34: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 35: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 36: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 37: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 38: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 39: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 40: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 41: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 42: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 43: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 44: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 45: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 46: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 47: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 48: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 49: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 50: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 51: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 52: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 53: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 54: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 55: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 56: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 57: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 58: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 59: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 60: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: CUDA0 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA1 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA2 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA3 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA4 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA5 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA6 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA7 KV buffer size = 6.75 MiB
llama_kv_cache_init: CUDA8 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA9 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA10 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA11 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA12 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA13 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA14 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA15 KV buffer size = 4.50 MiB
llama_new_context_with_model: KV self size = 137.25 MiB, c^KV (f16): 137.25 MiB, kv^T: not used
llama_new_context_with_model: CUDA_Host output buffer size = 1.97 MiB
llama_new_context_with_model: CUDA0 compute buffer size = 455.00 MiB
llama_new_context_with_model: CUDA1 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA2 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA3 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA4 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA5 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA6 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA7 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA8 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA9 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA10 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA11 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA12 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA13 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA14 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA15 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA_Host compute buffer size = 18.01 MiB
llama_new_context_with_model: graph nodes = 3487
llama_new_context_with_model: graph splits = 65
system_info: n_threads = 64 / 128 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 |
perplexity: tokenizing the input ..
perplexity: tokenization took 1206.19 ms
perplexity: calculating perplexity over 561 chunks, n_ctx=512, batch_size=2048, n_seq=4
perplexity: 18.19 seconds per pass - ETA 42.52 minutes
[1]2.5586,[2]3.3168,[3]2.3936,[4]2.0039,[5]1.8142,[6]1.6702,[7]1.5775,[8]1.5075,[9]1.4595,[10]1.4201,[11]1.4070,[12]1.4370,[13]1.4496,[14]1.5781,[15]1.7114,[16]1.7697,[17]1.9326,[18]2.0621,[19]2.0268,[20]2.0155,[21]2.1268,[22]2.1007,[23]2.0732,[24]2.0870,[25]2.0591,[26]2.0345,[27]2.0835,[28]2.0909,[29]2.1415,[30]2.1734,[31]2.2082,[32]2.2263,[33]2.2655,[34]2.3099,[35]2.3604,[36]2.4126,[37]2.4474,[38]2.4946,[39]2.5347,[40]2.5951,[41]2.6395,[42]2.6509,[43]2.7003,[44]2.7157,[45]2.7953,[46]2.8467,[47]2.8028,[48]2.7555,[49]2.7335,[50]2.7524,[51]2.7983,[52]2.8127,[53]2.8648,[54]2.8779,[55]2.9093,[56]2.9418,[57]2.9568,[58]2.9947,[59]3.0068,[60]3.0541,[61]3.0963,[62]3.1488,[63]3.1812,[64]3.2268,[65]3.2363,[66]3.2208,[67]3.1983,[68]3.2313,[69]3.2274,[70]3.2453,[71]3.2632,[72]3.2785,[73]3.2926,[74]3.3158,[75]3.2951,[76]3.2468,[77]3.2033,[78]3.1978,[79]3.1752,[80]3.1585,[81]3.1220,[82]3.1263,[83]3.0939,[84]3.0572,[85]3.0226,[86]2.9979,[87]2.9913,[88]2.9620,[89]2.9446,[90]2.9189,[91]2.8900,[92]2.8640,[93]2.8371,[94]2.8118,[95]2.7877,[96]2.7867,[97]2.7933,[98]2.7781,[99]2.7607,[100]2.7637,[101]2.7557,[102]2.7735,[103]2.8013,[104]2.8209,[105]2.8182,[106]2.8415,[107]2.8660,[108]2.8867,[109]2.9204,[110]2.9543,[111]2.9739,[112]2.9475,[113]2.9354,[114]2.9137,[115]2.8968,[116]2.8835,[117]2.8601,[118]2.8389,[119]2.8175,[120]2.7984,[121]2.7821,[122]2.7637,[123]2.7472,[124]2.7279,[125]2.7105,[126]2.6936,[127]2.6807,[128]2.6722,[129]2.6625,[130]2.6503,[131]2.6437,[132]2.6510,[133]2.6603,[134]2.6677,[135]2.6788,[136]2.6953,[137]2.7118,[138]2.7200,[139]2.7321,[140]2.7326,[141]2.7339,[142]2.7328,[143]2.7330,[144]2.7294,[145]2.7202,[146]2.7186,[147]2.7227,[148]2.7223,[149]2.7235,[150]2.7177,[151]2.7155,[152]2.7123,[153]2.7083,[154]2.7085,[155]2.7128,[156]2.7143,[157]2.7202,[158]2.7292,[159]2.7310,[160]2.7400,[161]2.7484,[162]2.7577,[163]2.7627,[164]2.7832,[165]2.8070,[166]2.8242,[167]2.8363,[168]2.8608,[169]2.8836,[170]2.9055,[171]2.9290,[172]2.9125,[173]2.8952,[174]2.8821,[175]2.8691,[176]2.8562,[177]2.8448,[178]2.8317,[179]2.8177,[180]2.8216,[181]2.8357,[182]2.8509,[183]2.8654,[184]2.8795,[185]2.8898,[186]2.9065,[187]2.9222,[188]2.9363,[189]2.9468,[190]2.9471,[191]2.9542,[192]2.9580,[193]2.9635,[194]2.9830,[195]2.9918,[196]3.0051,[197]3.0148,[198]3.0188,[199]3.0241,[200]3.0233,[201]3.0385,[202]3.0337,[203]3.0388,[204]3.0422,[205]3.0420,[206]3.0443,[207]3.0531,[208]3.0634,[209]3.0728,[210]3.0731,[211]3.0685,[212]3.0687,[213]3.0762,[214]3.0784,[215]3.0843,[216]3.0848,[217]3.0807,[218]3.0809,[219]3.0820,[220]3.0810,[221]3.0813,[222]3.0813,[223]3.0817,[224]3.0867,[225]3.0886,[226]3.0803,[227]3.0781,[228]3.0803,[229]3.0848,[230]3.0911,[231]3.0974,[232]3.0887,[233]3.0807,[234]3.0811,[235]3.0794,[236]3.0880,[237]3.0963,[238]3.1062,[239]3.1160,[240]3.1252,[241]3.1364,[242]3.1510,[243]3.1645,[244]3.1728,[245]3.1840,[246]3.1943,[247]3.1934,[248]3.1892,[249]3.1868,[250]3.1800,[251]3.1777,[252]3.1798,[253]3.1834,[254]3.1905,[255]3.1969,[256]3.2006,[257]3.2030,[258]3.2042,[259]3.2075,[260]3.2096,[261]3.2107,[262]3.2097,[263]3.2156,[264]3.2179,[265]3.2182,[266]3.2202,[267]3.2229,[268]3.2268,[269]3.2301,[270]3.2294,[271]3.2280,[272]3.2208,[273]3.2207,[274]3.2138,[275]3.2031,[276]3.1926,[277]3.1942,[278]3.2046,[279]3.2111,[280]3.2190,[281]3.2263,[282]3.2325,[283]3.2389,[284]3.2453,[285]3.2590,[286]3.2612,[287]3.2646,[288]3.2693,[289]3.2719,[290]3.2635,[291]3.2541,[292]3.2530,[293]3.2522,[294]3.2497,[295]3.2474,[296]3.2496,[297]3.2501,[298]3.2552,[299]3.2615,[300]3.2648,[301]3.2686,[302]3.2711,[303]3.2731,[304]3.2725,[305]3.2842,
[306]3.2918,[307]3.3028,[308]3.2915,[309]3.2864,[310]3.2767,[311]3.2808,[312]3.2834,[313]3.2905,[314]3.2929,[315]3.2962,[316]3.2976,[317]3.2993,[318]3.2998,[319]3.3001,[320]3.3043,[321]3.3045,[322]3.3063,[323]3.3130,[324]3.3136,[325]3.3189,[326]3.3235,[327]3.3277,[328]3.3307,[329]3.3323,[330]3.3385,[331]3.3425,[332]3.3470,[333]3.3456,[334]3.3455,[335]3.3460,[336]3.3460,[337]3.3469,[338]3.3473,[339]3.3498,[340]3.3535,[341]3.3589,[342]3.3677,[343]3.3770,[344]3.3823,[345]3.3740,[346]3.3661,[347]3.3608,[348]3.3532,[349]3.3494,[350]3.3475,[351]3.3521,[352]3.3673,[353]3.3764,[354]3.3896,[355]3.3983,[356]3.4036,[357]3.4156,[358]3.4254,[359]3.4285,[360]3.4348,[361]3.4443,[362]3.4531,[363]3.4589,[364]3.4653,[365]3.4718,[366]3.4826,[367]3.4915,[368]3.4983,[369]3.5063,[370]3.5149,[371]3.5286,[372]3.5377,[373]3.5409,[374]3.5444,[375]3.5493,[376]3.5624,[377]3.5738,[378]3.5767,[379]3.5761,[380]3.5727,
[381]3.5775,[382]3.5833,[383]3.5868,[384]3.5911,[385]3.5948,[386]3.6008,[387]3.6066,[388]3.6099,[389]3.5991,[390]3.5896,[391]3.5787,[392]3.5730,[393]3.5635,[394]3.5542,[395]3.5450,[396]3.5347,[397]3.5257,[398]3.5160,[399]3.5056,[400]3.4975,[401]3.4874,[402]3.4768,[403]3.4678,[404]3.4573,[405]3.4476,[406]3.4375,[407]3.4281,[408]3.4192,[409]3.4103,[410]3.4041,[411]3.4049,[412]3.4004,[413]3.4022,[414]3.4041,[415]3.4011,[416]3.4013,[417]3.4034,[418]3.3976,[419]3.3989,[420]3.3964,[421]3.3953,[422]3.3969,[423]3.3962,[424]3.4001,[425]3.3995,[426]3.4001,[427]3.3991,[428]3.4014,[429]3.4032,[430]3.4061,[431]3.4070,[432]3.4063,[433]3.4025,[434]3.4028,[435]3.3954,[436]3.3891,[437]3.3850,[438]3.3831,[439]3.3805,[440]3.3854,[441]3.3908,[442]3.3981,[443]3.3963,[444]3.3969,[445]3.3980,[446]3.4029,[447]3.4060,[448]3.4086,[449]3.4116,[450]3.4154,[451]3.4183,[452]3.4205,[453]3.4224,[454]3.4208,[455]3.4229,[456]3.4232,[457]3.4260,[458]3.4311,[459]3.4317,[460]3.4318,[461]3.4283,[462]3.4320,[463]3.4395,[464]3.4450,[465]3.4379,[466]3.4361,[467]3.4345,[468]3.4359,[469]3.4330,[470]3.4301,[471]3.4306,[472]3.4314,[473]3.4306,[474]3.4296,[475]3.4307,[476]3.4293,[477]3.4283,[478]3.4291,[479]3.4308,[480]3.4335,[481]3.4294,[482]3.4328,[483]3.4319,[484]3.4355,[485]3.4418,[486]3.4448,[487]3.4487,[488]3.4541,[489]3.4566,[490]3.4610,[491]3.4673,[492]3.4718,[493]3.4714,[494]3.4726,[495]3.4751,[496]3.4770,[497]3.4799,[498]3.4802,[499]3.4795,[500]3.4835,[501]3.4880,[502]3.4873,[503]3.4858,[504]3.4878,[505]3.4910,[506]3.4996,[507]3.5024,[508]3.5058,[509]3.4982,[510]3.4929,[511]3.4865,[512]3.4821,[513]3.4759,[514]3.4746,[515]3.4771,[516]3.4721,[517]3.4719,[518]3.4709,[519]3.4716,[520]3.4765,[521]3.4751,[522]3.4736,[523]3.4794,[524]3.4783,[525]3.4766,[526]3.4719,[527]3.4668,[528]3.4634,[529]3.4601,[530]3.4570,[531]3.4539,[532]3.4481,[533]3.4416,[534]3.4376,[535]3.4384,[536]3.4414,[537]3.4444,[538]3.4472,[539]3.4499,[540]3.4553,[541]3.4589,[542]3.4613,[543]3.4555,[544]3.4514,[545]3.4510,[546]3.4443,[547]3.4380,[548]3.4314,[549]3.4247,[550]3.4187,[551]3.4125,[552]3.4067,[553]3.4010,[554]3.3990,[555]3.3976,[556]3.4004,[557]3.4045,[558]3.4104,[559]3.4150,[560]3.4202,[561]3.4184,
Final estimate: PPL = 3.4184 +/- 0.01902
```
</details>
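For readers adapting this layout to a different GPU count: the long `-ot` list above is simply one `regex=device` pair per group of layers, mapping each block's routed-expert tensors to a fixed CUDA device. A minimal sketch of the same technique, with hypothetical model and dataset paths; the `[3-5]` character class is only a more compact way to write the explicit alternation used above:

```
# Pin the routed experts of blocks 3-5 to CUDA0; all other layers follow --n-gpu-layers.
./build/bin/llama-perplexity -m model.gguf -f wiki.test.raw \
  -fmoe -mla 2 -fa --n-gpu-layers 999 \
  -ot "blk\.[3-5]\.ffn_(down|gate|up)_exps\.weight=CUDA0"
```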