Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-01-26)
🐛 #262 - Fix #261
| Author | ikawrakow |
|---|---|
| State | ❌ Closed |
| Created | 2025-03-18 |
| Updated | 2025-03-18 |
💬 Conversation
👤 davidsyoung commented on 2025-03-18 at 10:41:29:
Unfortunately I'm still getting NaNs under perplexity. I built the latest PR regarding the q8_0 KV cache.
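For context, a perplexity run of the kind described above might look like the sketch below. The test file and GPU offload setting are placeholders (the exact invocation isn't shown in this report); the relevant part is selecting the q8_0 KV cache via `-ctk`/`-ctv`.

```bash
# Hypothetical perplexity invocation; paths/offload are placeholders,
# only the q8_0 K/V cache types are the point of interest here.
./build/bin/llama-perplexity \
  -m /models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf \
  -f /models/wiki.test.raw \
  -ctk q8_0 -ctv q8_0 \
  -ngl 99
```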
Quant command
./build/bin/llama-quantize --imatrix /models/deepseek-config/imatrix.dat \
--token-embedding-type q8_0 \
--attn-q-type q6_K \
--attn-k-type q6_K \
--attn-v-type q6_K \
--attn-qkv-type q6_K \
--attn-output-type q6_K \
--ffn-gate-type q6_K \
--ffn-down-type q6_K \
--ffn-up-type q6_K \
--custom-q "\.ffn_.*_shexp\.weight=q6_K,output\.weight=q6_K" \
--custom-q "blk\.3\.ffn_down_exps\.weight=q5_K,blk\.4\.ffn_down_exps\.weight=q5_K,blk\.5\.ffn_down_exps\.weight=q5_K,blk\.3\.ffn_up_exps\.weight=iq4_k,blk\.3\.ffn_gate_exps\.weight=iq4_k,blk\.4\.ffn_up_exps\.weight=iq4_k,blk\.4\.ffn_gate_exps\.weight=iq4_k,blk\.5\.ffn_up_exps\.weight=iq4_k,blk\.5\.ffn_gate_exps\.weight=iq4_k" \
--custom-q "blk\.6\.ffn_down_exps\.weight=q5_K,blk\.7\.ffn_down_exps\.weight=q5_K,blk\.8\.ffn_down_exps\.weight=q5_K,blk\.6\.ffn_up_exps\.weight=iq4_k,blk\.6\.ffn_gate_exps\.weight=iq4_k,blk\.7\.ffn_up_exps\.weight=iq4_k,blk\.7\.ffn_gate_exps\.weight=iq4_k,blk\.8\.ffn_up_exps\.weight=iq4_k,blk\.8\.ffn_gate_exps\.weight=iq4_k" \
--custom-q "blk\.9\.ffn_down_exps\.weight=iq4_k,blk\.10\.ffn_down_exps\.weight=iq4_k,blk\.11\.ffn_down_exps\.weight=iq4_k,blk\.12\.ffn_down_exps\.weight=iq4_k,blk\.9\.ffn_up_exps\.weight=iq3_k,blk\.9\.ffn_gate_exps\.weight=iq3_k,blk\.10\.ffn_up_exps\.weight=iq3_k,blk\.10\.ffn_gate_exps\.weight=iq3_k,blk\.11\.ffn_up_exps\.weight=iq3_k,blk\.11\.ffn_gate_exps\.weight=iq3_k,blk\.12\.ffn_up_exps\.weight=iq3_k,blk\.12\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.13\.ffn_down_exps\.weight=iq4_k,blk\.14\.ffn_down_exps\.weight=iq4_k,blk\.15\.ffn_down_exps\.weight=iq4_k,blk\.16\.ffn_down_exps\.weight=iq4_k,blk\.13\.ffn_up_exps\.weight=iq3_k,blk\.13\.ffn_gate_exps\.weight=iq3_k,blk\.14\.ffn_up_exps\.weight=iq3_k,blk\.14\.ffn_gate_exps\.weight=iq3_k,blk\.15\.ffn_up_exps\.weight=iq3_k,blk\.15\.ffn_gate_exps\.weight=iq3_k,blk\.16\.ffn_up_exps\.weight=iq3_k,blk\.16\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.17\.ffn_down_exps\.weight=iq4_k,blk\.18\.ffn_down_exps\.weight=iq4_k,blk\.19\.ffn_down_exps\.weight=iq4_k,blk\.20\.ffn_down_exps\.weight=iq4_k,blk\.17\.ffn_up_exps\.weight=iq3_k,blk\.17\.ffn_gate_exps\.weight=iq3_k,blk\.18\.ffn_up_exps\.weight=iq3_k,blk\.18\.ffn_gate_exps\.weight=iq3_k,blk\.19\.ffn_up_exps\.weight=iq3_k,blk\.19\.ffn_gate_exps\.weight=iq3_k,blk\.20\.ffn_up_exps\.weight=iq3_k,blk\.20\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.21\.ffn_down_exps\.weight=iq4_k,blk\.22\.ffn_down_exps\.weight=iq4_k,blk\.23\.ffn_down_exps\.weight=iq4_k,blk\.24\.ffn_down_exps\.weight=iq4_k,blk\.21\.ffn_up_exps\.weight=iq3_k,blk\.21\.ffn_gate_exps\.weight=iq3_k,blk\.22\.ffn_up_exps\.weight=iq3_k,blk\.22\.ffn_gate_exps\.weight=iq3_k,blk\.23\.ffn_up_exps\.weight=iq3_k,blk\.23\.ffn_gate_exps\.weight=iq3_k,blk\.24\.ffn_up_exps\.weight=iq3_k,blk\.24\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.25\.ffn_down_exps\.weight=iq4_k,blk\.26\.ffn_down_exps\.weight=iq4_k,blk\.27\.ffn_down_exps\.weight=iq4_k,blk\.28\.ffn_down_exps\.weight=iq4_k,blk\.25\.ffn_up_exps\.weight=iq3_k,blk\.25\.ffn_gate_exps\.weight=iq3_k,blk\.26\.ffn_up_exps\.weight=iq3_k,blk\.26\.ffn_gate_exps\.weight=iq3_k,blk\.27\.ffn_up_exps\.weight=iq3_k,blk\.27\.ffn_gate_exps\.weight=iq3_k,blk\.28\.ffn_up_exps\.weight=iq3_k,blk\.28\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.29\.ffn_down_exps\.weight=iq4_k,blk\.30\.ffn_down_exps\.weight=iq4_k,blk\.31\.ffn_down_exps\.weight=iq4_k,blk\.32\.ffn_down_exps\.weight=iq4_k,blk\.29\.ffn_up_exps\.weight=iq3_k,blk\.29\.ffn_gate_exps\.weight=iq3_k,blk\.30\.ffn_up_exps\.weight=iq3_k,blk\.30\.ffn_gate_exps\.weight=iq3_k,blk\.31\.ffn_up_exps\.weight=iq3_k,blk\.31\.ffn_gate_exps\.weight=iq3_k,blk\.32\.ffn_up_exps\.weight=iq3_k,blk\.32\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.33\.ffn_down_exps\.weight=iq4_k,blk\.34\.ffn_down_exps\.weight=iq4_k,blk\.35\.ffn_down_exps\.weight=iq4_k,blk\.36\.ffn_down_exps\.weight=iq4_k,blk\.33\.ffn_up_exps\.weight=iq3_k,blk\.33\.ffn_gate_exps\.weight=iq3_k,blk\.34\.ffn_up_exps\.weight=iq3_k,blk\.34\.ffn_gate_exps\.weight=iq3_k,blk\.35\.ffn_up_exps\.weight=iq3_k,blk\.35\.ffn_gate_exps\.weight=iq3_k,blk\.36\.ffn_up_exps\.weight=iq3_k,blk\.36\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.37\.ffn_down_exps\.weight=iq4_k,blk\.38\.ffn_down_exps\.weight=iq4_k,blk\.39\.ffn_down_exps\.weight=iq4_k,blk\.40\.ffn_down_exps\.weight=iq4_k,blk\.37\.ffn_up_exps\.weight=iq3_k,blk\.37\.ffn_gate_exps\.weight=iq3_k,blk\.38\.ffn_up_exps\.weight=iq3_k,blk\.38\.ffn_gate_exps\.weight=iq3_k,blk\.39\.ffn_up_exps\.weight=iq3_k,blk\.39\.ffn_gate_exps\.weight=iq3_k,blk\.40\.ffn_up_exps\.weight=iq3_k,blk\.40\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.41\.ffn_down_exps\.weight=iq4_k,blk\.42\.ffn_down_exps\.weight=iq4_k,blk\.43\.ffn_down_exps\.weight=iq4_k,blk\.44\.ffn_down_exps\.weight=iq4_k,blk\.41\.ffn_up_exps\.weight=iq3_k,blk\.41\.ffn_gate_exps\.weight=iq3_k,blk\.42\.ffn_up_exps\.weight=iq3_k,blk\.42\.ffn_gate_exps\.weight=iq3_k,blk\.43\.ffn_up_exps\.weight=iq3_k,blk\.43\.ffn_gate_exps\.weight=iq3_k,blk\.44\.ffn_up_exps\.weight=iq3_k,blk\.44\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.45\.ffn_down_exps\.weight=iq4_k,blk\.46\.ffn_down_exps\.weight=iq4_k,blk\.47\.ffn_down_exps\.weight=iq4_k,blk\.48\.ffn_down_exps\.weight=iq4_k,blk\.45\.ffn_up_exps\.weight=iq3_k,blk\.45\.ffn_gate_exps\.weight=iq3_k,blk\.46\.ffn_up_exps\.weight=iq3_k,blk\.46\.ffn_gate_exps\.weight=iq3_k,blk\.47\.ffn_up_exps\.weight=iq3_k,blk\.47\.ffn_gate_exps\.weight=iq3_k,blk\.48\.ffn_up_exps\.weight=iq3_k,blk\.48\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.49\.ffn_down_exps\.weight=iq4_k,blk\.50\.ffn_down_exps\.weight=iq4_k,blk\.51\.ffn_down_exps\.weight=iq4_k,blk\.52\.ffn_down_exps\.weight=iq4_k,blk\.49\.ffn_up_exps\.weight=iq3_k,blk\.49\.ffn_gate_exps\.weight=iq3_k,blk\.50\.ffn_up_exps\.weight=iq3_k,blk\.50\.ffn_gate_exps\.weight=iq3_k,blk\.51\.ffn_up_exps\.weight=iq3_k,blk\.51\.ffn_gate_exps\.weight=iq3_k,blk\.52\.ffn_up_exps\.weight=iq3_k,blk\.52\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.53\.ffn_down_exps\.weight=iq4_k,blk\.54\.ffn_down_exps\.weight=iq4_k,blk\.55\.ffn_down_exps\.weight=iq4_k,blk\.56\.ffn_down_exps\.weight=iq4_k,blk\.53\.ffn_up_exps\.weight=iq3_k,blk\.53\.ffn_gate_exps\.weight=iq3_k,blk\.54\.ffn_up_exps\.weight=iq3_k,blk\.54\.ffn_gate_exps\.weight=iq3_k,blk\.55\.ffn_up_exps\.weight=iq3_k,blk\.55\.ffn_gate_exps\.weight=iq3_k,blk\.56\.ffn_up_exps\.weight=iq3_k,blk\.56\.ffn_gate_exps\.weight=iq3_k" \
--custom-q "blk\.57\.ffn_down_exps\.weight=iq4_k,blk\.58\.ffn_down_exps\.weight=iq4_k,blk\.59\.ffn_down_exps\.weight=iq4_k,blk\.60\.ffn_down_exps\.weight=iq4_k,blk\.57\.ffn_up_exps\.weight=iq3_k,blk\.57\.ffn_gate_exps\.weight=iq3_k,blk\.58\.ffn_up_exps\.weight=iq3_k,blk\.58\.ffn_gate_exps\.weight=iq3_k,blk\.59\.ffn_up_exps\.weight=iq3_k,blk\.59\.ffn_gate_exps\.weight=iq3_k,blk\.60\.ffn_up_exps\.weight=iq3_k,blk\.60\.ffn_gate_exps\.weight=iq3_k" \
/storage/DeepSeek-R1-GGUF/unsloth_DeepSeek-R1-BF16-256x21B-F16-00001-of-00059.gguf \
/models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf \
q6_K 6
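Since blocks 9 through 60 all follow the same pattern, the repeated `--custom-q` arguments could also be generated with a small shell loop. This is purely an illustrative sketch, not part of the original command, and it assumes `--custom-q` accepts one long comma-separated list of `regex=type` rules, as the per-argument lists above suggest.

```bash
# Illustrative only: build the iq4_k/iq3_k rules for blocks 9..60 programmatically
# instead of spelling out each --custom-q argument by hand.
rules=""
for b in $(seq 9 60); do
  rules+="blk\\.$b\\.ffn_down_exps\\.weight=iq4_k,"
  rules+="blk\\.$b\\.ffn_up_exps\\.weight=iq3_k,"
  rules+="blk\\.$b\\.ffn_gate_exps\\.weight=iq3_k,"
done
# Print the argument that would be passed to llama-quantize (trailing comma stripped).
echo --custom-q "${rules%,}"
```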
Quant command output
Adding custom rule \.ffn_.*_shexp\.weight -> q6_K
Adding custom rule output\.weight -> q6_K
Adding custom rule blk\.3\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.4\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.5\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.3\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.3\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.4\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.4\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.5\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.5\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.6\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.7\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.8\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.6\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.6\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.7\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.7\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.8\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.8\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.9\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.10\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.11\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.12\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.9\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.9\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.10\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.10\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.11\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.11\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.12\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.12\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.13\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.14\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.15\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.16\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.13\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.13\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.14\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.14\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.15\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.15\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.16\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.16\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.17\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.18\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.19\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.20\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.17\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.17\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.18\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.18\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.19\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.19\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.20\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.20\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.21\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.22\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.23\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.24\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.21\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.21\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.22\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.22\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.23\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.23\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.24\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.24\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.25\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.26\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.27\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.28\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.25\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.25\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.26\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.26\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.27\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.27\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.28\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.28\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.29\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.30\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.31\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.32\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.29\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.29\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.30\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.30\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.31\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.31\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.32\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.32\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.33\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.34\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.35\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.36\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.33\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.33\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.34\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.34\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.35\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.35\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.36\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.36\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.37\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.38\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.39\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.40\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.37\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.37\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.38\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.38\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.39\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.39\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.40\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.40\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.41\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.42\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.43\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.44\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.41\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.41\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.42\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.42\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.43\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.43\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.44\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.44\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.45\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.46\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.47\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.48\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.45\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.45\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.46\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.46\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.47\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.47\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.48\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.48\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.49\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.50\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.51\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.52\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.49\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.49\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.50\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.50\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.51\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.51\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.52\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.52\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.53\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.54\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.55\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.56\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.53\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.53\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.54\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.54\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.55\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.55\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.56\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.56\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.57\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.58\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.59\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.60\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.57\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.57\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.58\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.58\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.59\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.59\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.60\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.60\.ffn_gate_exps\.weight -> iq3_k
load_imatrix: imatrix dataset='imatrix-training-full-3'
load_imatrix: loaded 720 importance matrix entries from /models/deepseek-config/imatrix.dat computed on 315 chunks
prepare_imatrix: have 720 importance matrix entries
main: build = 0 (unknown)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: quantizing '/storage/DeepSeek-R1-GGUF/unsloth_DeepSeek-R1-BF16-256x21B-F16-00001-of-00059.gguf' to '/models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf' as Q6_K using 64 threads
llama_model_loader: additional 58 GGUFs metadata loaded.
llama_model_loader: loaded meta data with 53 key-value pairs and 1147 tensors from /storage/DeepSeek-R1-GGUF/unsloth_DeepSeek-R1-BF16-256x21B-F16-00001-of-00059.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = deepseek2
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = unsloth_DeepSeek R1 BF16
llama_model_loader: - kv 3: general.size_label str = 256x21B
llama_model_loader: - kv 4: general.license str = mit
llama_model_loader: - kv 5: general.base_model.count u32 = 1
llama_model_loader: - kv 6: general.base_model.0.name str = DeepSeek R1
llama_model_loader: - kv 7: general.base_model.0.organization str = Deepseek Ai
llama_model_loader: - kv 8: general.base_model.0.repo_url str = https://huggingface.co/deepseek-ai/De...
llama_model_loader: - kv 9: general.tags arr[str,3] = ["deepseek", "unsloth", "transformers"]
llama_model_loader: - kv 10: general.languages arr[str,1] = ["en"]
llama_model_loader: - kv 11: deepseek2.block_count u32 = 61
llama_model_loader: - kv 12: deepseek2.context_length u32 = 163840
llama_model_loader: - kv 13: deepseek2.embedding_length u32 = 7168
llama_model_loader: - kv 14: deepseek2.feed_forward_length u32 = 18432
llama_model_loader: - kv 15: deepseek2.attention.head_count u32 = 128
llama_model_loader: - kv 16: deepseek2.attention.head_count_kv u32 = 128
llama_model_loader: - kv 17: deepseek2.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 18: deepseek2.attention.layer_norm_rms_epsilon f32 = 0.000001
llama_model_loader: - kv 19: deepseek2.expert_used_count u32 = 8
llama_model_loader: - kv 20: general.file_type u32 = 1
llama_model_loader: - kv 21: deepseek2.leading_dense_block_count u32 = 3
llama_model_loader: - kv 22: deepseek2.vocab_size u32 = 129280
llama_model_loader: - kv 23: deepseek2.attention.q_lora_rank u32 = 1536
llama_model_loader: - kv 24: deepseek2.attention.kv_lora_rank u32 = 512
llama_model_loader: - kv 25: deepseek2.attention.key_length u32 = 192
llama_model_loader: - kv 26: deepseek2.attention.value_length u32 = 128
llama_model_loader: - kv 27: deepseek2.expert_feed_forward_length u32 = 2048
llama_model_loader: - kv 28: deepseek2.expert_count u32 = 256
llama_model_loader: - kv 29: deepseek2.expert_shared_count u32 = 1
llama_model_loader: - kv 30: deepseek2.expert_weights_scale f32 = 2.500000
llama_model_loader: - kv 31: deepseek2.expert_weights_norm bool = true
llama_model_loader: - kv 32: deepseek2.expert_gating_func u32 = 2
llama_model_loader: - kv 33: deepseek2.rope.dimension_count u32 = 64
llama_model_loader: - kv 34: deepseek2.rope.scaling.type str = yarn
llama_model_loader: - kv 35: deepseek2.rope.scaling.factor f32 = 40.000000
llama_model_loader: - kv 36: deepseek2.rope.scaling.original_context_length u32 = 4096
llama_model_loader: - kv 37: deepseek2.rope.scaling.yarn_log_multiplier f32 = 0.100000
llama_model_loader: - kv 38: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 39: tokenizer.ggml.pre str = deepseek-v3
llama_model_loader: - kv 40: tokenizer.ggml.tokens arr[str,129280] = ["<|begin▁of▁sentence|>", "<<3C>...
llama_model_loader: - kv 41: tokenizer.ggml.token_type arr[i32,129280] = [3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 42: tokenizer.ggml.merges arr[str,127741] = ["Ġ t", "Ġ a", "i n", "Ġ Ġ", "h e...
llama_model_loader: - kv 43: tokenizer.ggml.bos_token_id u32 = 0
llama_model_loader: - kv 44: tokenizer.ggml.eos_token_id u32 = 1
llama_model_loader: - kv 45: tokenizer.ggml.padding_token_id u32 = 128815
llama_model_loader: - kv 46: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 47: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 48: tokenizer.chat_template str = {% if not add_generation_prompt is de...
llama_model_loader: - kv 49: general.quantization_version u32 = 2
llama_model_loader: - kv 50: split.no u16 = 0
llama_model_loader: - kv 51: split.count u16 = 59
llama_model_loader: - kv 52: split.tensors.count i32 = 1147
llama_model_loader: - type f32: 361 tensors
llama_model_loader: - type f16: 786 tensors
================================ Have weights data with 720 entries
[ 1/1147] token_embd.weight - [ 7168, 129280, 1, 1], type = f16,
====== llama_model_quantize_internal: did not find weights for token_embd.weight
converting to q8_0 .. size = 1767.50 MiB -> 938.98 MiB
[ 2/1147] blk.0.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 3/1147] blk.0.ffn_down.weight - [18432, 7168, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 4/1147] blk.0.ffn_gate.weight - [ 7168, 18432, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 5/1147] blk.0.ffn_up.weight - [ 7168, 18432, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 6/1147] blk.0.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 7/1147] blk.0.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 8/1147] blk.0.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 9/1147] blk.0.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 10/1147] blk.0.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.0.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 11/1147] blk.0.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 12/1147] blk.0.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.0.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 13/1147] blk.0.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 14/1147] blk.0.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 15/1147] blk.0.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 16/1147] blk.1.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 17/1147] blk.1.ffn_down.weight - [18432, 7168, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 18/1147] blk.1.ffn_gate.weight - [ 7168, 18432, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 19/1147] blk.1.ffn_up.weight - [ 7168, 18432, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 20/1147] blk.1.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 21/1147] blk.1.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 22/1147] blk.1.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 23/1147] blk.1.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 24/1147] blk.1.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.1.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 25/1147] blk.1.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 26/1147] blk.1.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.1.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 27/1147] blk.1.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 28/1147] blk.1.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 29/1147] blk.1.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 30/1147] blk.2.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 31/1147] blk.2.ffn_down.weight - [18432, 7168, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 32/1147] blk.2.ffn_gate.weight - [ 7168, 18432, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 33/1147] blk.2.ffn_up.weight - [ 7168, 18432, 1, 1], type = f16, converting to q6_K .. size = 252.00 MiB -> 103.36 MiB
[ 34/1147] blk.2.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 35/1147] blk.2.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 36/1147] blk.2.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 37/1147] blk.2.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 38/1147] blk.2.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.2.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 39/1147] blk.2.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 40/1147] blk.2.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.2.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 41/1147] blk.2.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 42/1147] blk.2.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 43/1147] blk.2.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 44/1147] blk.3.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 45/1147] blk.3.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 46/1147] blk.3.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.3.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 47/1147] blk.3.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.3.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 48/1147] blk.3.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.3.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 49/1147] blk.3.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 50/1147] blk.3.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 51/1147] blk.3.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 52/1147] blk.3.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.3.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 53/1147] blk.3.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 54/1147] blk.3.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.3.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 55/1147] blk.3.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 56/1147] blk.3.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 57/1147] blk.3.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 58/1147] blk.3.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 59/1147] blk.3.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type q5_K for tensor blk.3.ffn_down_exps.weight
converting to q5_K .. size = 7168.00 MiB -> 2464.00 MiB
[ 60/1147] blk.3.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.3.ffn_gate_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 61/1147] blk.3.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.3.ffn_up_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 62/1147] blk.3.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 63/1147] blk.4.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 64/1147] blk.4.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 65/1147] blk.4.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.4.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 66/1147] blk.4.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.4.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 67/1147] blk.4.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.4.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 68/1147] blk.4.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 69/1147] blk.4.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 70/1147] blk.4.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 71/1147] blk.4.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.4.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 72/1147] blk.4.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 73/1147] blk.4.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.4.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 74/1147] blk.4.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 75/1147] blk.4.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 76/1147] blk.4.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 77/1147] blk.4.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 78/1147] blk.4.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type q5_K for tensor blk.4.ffn_down_exps.weight
converting to q5_K .. size = 7168.00 MiB -> 2464.00 MiB
[ 79/1147] blk.4.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.4.ffn_gate_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 80/1147] blk.4.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.4.ffn_up_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 81/1147] blk.4.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 82/1147] blk.5.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 83/1147] blk.5.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 84/1147] blk.5.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 85/1147] blk.5.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.5.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 86/1147] blk.5.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 87/1147] blk.5.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.5.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 88/1147] blk.5.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 89/1147] blk.5.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 90/1147] blk.5.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 91/1147] blk.5.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 92/1147] blk.5.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 93/1147] blk.5.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.5.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 94/1147] blk.5.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.5.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 95/1147] blk.5.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.5.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 96/1147] blk.5.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 97/1147] blk.5.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type q5_K for tensor blk.5.ffn_down_exps.weight
converting to q5_K .. size = 7168.00 MiB -> 2464.00 MiB
[ 98/1147] blk.5.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.5.ffn_gate_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 99/1147] blk.5.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.5.ffn_up_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 100/1147] blk.5.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 101/1147] blk.6.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 102/1147] blk.6.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 103/1147] blk.6.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.6.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 104/1147] blk.6.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.6.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 105/1147] blk.6.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.6.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 106/1147] blk.6.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 107/1147] blk.6.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 108/1147] blk.6.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 109/1147] blk.6.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.6.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 110/1147] blk.6.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 111/1147] blk.6.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.6.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 112/1147] blk.6.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 113/1147] blk.6.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 114/1147] blk.6.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 115/1147] blk.6.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 116/1147] blk.6.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type q5_K for tensor blk.6.ffn_down_exps.weight
converting to q5_K .. size = 7168.00 MiB -> 2464.00 MiB
[ 117/1147] blk.6.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.6.ffn_gate_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 118/1147] blk.6.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.6.ffn_up_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 119/1147] blk.6.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 120/1147] blk.7.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 121/1147] blk.7.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 122/1147] blk.7.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.7.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 123/1147] blk.7.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.7.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 124/1147] blk.7.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.7.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 125/1147] blk.7.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 126/1147] blk.7.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 127/1147] blk.7.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 128/1147] blk.7.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.7.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 129/1147] blk.7.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 130/1147] blk.7.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.7.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 131/1147] blk.7.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 132/1147] blk.7.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 133/1147] blk.7.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 134/1147] blk.7.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 135/1147] blk.7.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type q5_K for tensor blk.7.ffn_down_exps.weight
converting to q5_K .. size = 7168.00 MiB -> 2464.00 MiB
[ 136/1147] blk.7.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.7.ffn_gate_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 137/1147] blk.7.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.7.ffn_up_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 138/1147] blk.7.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 139/1147] blk.8.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 140/1147] blk.8.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 141/1147] blk.8.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.8.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 142/1147] blk.8.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.8.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 143/1147] blk.8.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.8.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 144/1147] blk.8.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 145/1147] blk.8.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 146/1147] blk.8.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 147/1147] blk.8.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.8.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 148/1147] blk.8.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 149/1147] blk.8.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.8.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 150/1147] blk.8.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 151/1147] blk.8.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 152/1147] blk.8.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 153/1147] blk.8.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 154/1147] blk.8.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type q5_K for tensor blk.8.ffn_down_exps.weight
converting to q5_K .. size = 7168.00 MiB -> 2464.00 MiB
[ 155/1147] blk.8.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.8.ffn_gate_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 156/1147] blk.8.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq4_k for tensor blk.8.ffn_up_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 157/1147] blk.8.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 158/1147] blk.9.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 159/1147] blk.9.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 160/1147] blk.9.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.9.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 161/1147] blk.9.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.9.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 162/1147] blk.9.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.9.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 163/1147] blk.9.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 164/1147] blk.9.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 165/1147] blk.9.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 166/1147] blk.9.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.9.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 167/1147] blk.9.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 168/1147] blk.9.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.9.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 169/1147] blk.9.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 170/1147] blk.9.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 171/1147] blk.9.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 172/1147] blk.10.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 173/1147] blk.10.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 174/1147] blk.10.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.10.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 175/1147] blk.10.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.10.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 176/1147] blk.10.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.10.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 177/1147] blk.10.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 178/1147] blk.10.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 179/1147] blk.10.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 180/1147] blk.10.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.10.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 181/1147] blk.10.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 182/1147] blk.10.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.10.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 183/1147] blk.10.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 184/1147] blk.10.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 185/1147] blk.10.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 186/1147] blk.9.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 187/1147] blk.9.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.9.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 188/1147] blk.9.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.9.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 189/1147] blk.9.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.9.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 190/1147] blk.9.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 191/1147] blk.10.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 192/1147] blk.10.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.10.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 193/1147] blk.10.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.10.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 194/1147] blk.10.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.10.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 195/1147] blk.10.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 196/1147] blk.11.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 197/1147] blk.11.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 198/1147] blk.11.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.11.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 199/1147] blk.11.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.11.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 200/1147] blk.11.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.11.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 201/1147] blk.11.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 202/1147] blk.11.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 203/1147] blk.11.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 204/1147] blk.11.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.11.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 205/1147] blk.11.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 206/1147] blk.11.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.11.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 207/1147] blk.11.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 208/1147] blk.11.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 209/1147] blk.11.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 210/1147] blk.11.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 211/1147] blk.11.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.11.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 212/1147] blk.11.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.11.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 213/1147] blk.11.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.11.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 214/1147] blk.11.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 215/1147] blk.12.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 216/1147] blk.12.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 217/1147] blk.12.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.12.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 218/1147] blk.12.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.12.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 219/1147] blk.12.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.12.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 220/1147] blk.12.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 221/1147] blk.12.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 222/1147] blk.12.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 223/1147] blk.12.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.12.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 224/1147] blk.12.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 225/1147] blk.12.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.12.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 226/1147] blk.12.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 227/1147] blk.12.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 228/1147] blk.12.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 229/1147] blk.12.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 230/1147] blk.12.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.12.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 231/1147] blk.12.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.12.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 232/1147] blk.12.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.12.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 233/1147] blk.12.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 234/1147] blk.13.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 235/1147] blk.13.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 236/1147] blk.13.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.13.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 237/1147] blk.13.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.13.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 238/1147] blk.13.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.13.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 239/1147] blk.13.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 240/1147] blk.13.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 241/1147] blk.13.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 242/1147] blk.13.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.13.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 243/1147] blk.13.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 244/1147] blk.13.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.13.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 245/1147] blk.13.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 246/1147] blk.13.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 247/1147] blk.13.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 248/1147] blk.13.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 249/1147] blk.13.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.13.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 250/1147] blk.13.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.13.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 251/1147] blk.13.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.13.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 252/1147] blk.13.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 253/1147] blk.14.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 254/1147] blk.14.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 255/1147] blk.14.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.14.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 256/1147] blk.14.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.14.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 257/1147] blk.14.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.14.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 258/1147] blk.14.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 259/1147] blk.14.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 260/1147] blk.14.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 261/1147] blk.14.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.14.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 262/1147] blk.14.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 263/1147] blk.14.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.14.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 264/1147] blk.14.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 265/1147] blk.14.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 266/1147] blk.14.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 267/1147] blk.14.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 268/1147] blk.14.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.14.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 269/1147] blk.14.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.14.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 270/1147] blk.14.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.14.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 271/1147] blk.14.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 272/1147] blk.15.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 273/1147] blk.15.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 274/1147] blk.15.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.15.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 275/1147] blk.15.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.15.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 276/1147] blk.15.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.15.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 277/1147] blk.15.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 278/1147] blk.15.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 279/1147] blk.15.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 280/1147] blk.15.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.15.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 281/1147] blk.15.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 282/1147] blk.15.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.15.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 283/1147] blk.15.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 284/1147] blk.15.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 285/1147] blk.15.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 286/1147] blk.15.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 287/1147] blk.15.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.15.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 288/1147] blk.15.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.15.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 289/1147] blk.15.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.15.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 290/1147] blk.15.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 291/1147] blk.16.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 292/1147] blk.16.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 293/1147] blk.16.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.16.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 294/1147] blk.16.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.16.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 295/1147] blk.16.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.16.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 296/1147] blk.16.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 297/1147] blk.16.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 298/1147] blk.16.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 299/1147] blk.16.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.16.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 300/1147] blk.16.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 301/1147] blk.16.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.16.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 302/1147] blk.16.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 303/1147] blk.16.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 304/1147] blk.16.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 305/1147] blk.16.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 306/1147] blk.16.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.16.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 307/1147] blk.16.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.16.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 308/1147] blk.16.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.16.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 309/1147] blk.16.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 310/1147] blk.17.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 311/1147] blk.17.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 312/1147] blk.17.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.17.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 313/1147] blk.17.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.17.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 314/1147] blk.17.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.17.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 315/1147] blk.17.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 316/1147] blk.17.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 317/1147] blk.17.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 318/1147] blk.17.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.17.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 319/1147] blk.17.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 320/1147] blk.17.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.17.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 321/1147] blk.17.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 322/1147] blk.17.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 323/1147] blk.17.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 324/1147] blk.17.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 325/1147] blk.17.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.17.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 326/1147] blk.17.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.17.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 327/1147] blk.17.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.17.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 328/1147] blk.17.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 329/1147] blk.18.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 330/1147] blk.18.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 331/1147] blk.18.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.18.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 332/1147] blk.18.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.18.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 333/1147] blk.18.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.18.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 334/1147] blk.18.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 335/1147] blk.18.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 336/1147] blk.18.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 337/1147] blk.18.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.18.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 338/1147] blk.18.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 339/1147] blk.18.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.18.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 340/1147] blk.18.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 341/1147] blk.18.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 342/1147] blk.18.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 343/1147] blk.18.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 344/1147] blk.18.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.18.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 345/1147] blk.18.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.18.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 346/1147] blk.18.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.18.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 347/1147] blk.18.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 348/1147] blk.19.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 349/1147] blk.19.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 350/1147] blk.19.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.19.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 351/1147] blk.19.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.19.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 352/1147] blk.19.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.19.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 353/1147] blk.19.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 354/1147] blk.19.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 355/1147] blk.19.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 356/1147] blk.19.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.19.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 357/1147] blk.19.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 358/1147] blk.19.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.19.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 359/1147] blk.19.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 360/1147] blk.19.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 361/1147] blk.19.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 362/1147] blk.19.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 363/1147] blk.19.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.19.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 364/1147] blk.19.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.19.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 365/1147] blk.19.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.19.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 366/1147] blk.19.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 367/1147] blk.20.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 368/1147] blk.20.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 369/1147] blk.20.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.20.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 370/1147] blk.20.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.20.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 371/1147] blk.20.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.20.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 372/1147] blk.20.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 373/1147] blk.20.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 374/1147] blk.20.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 375/1147] blk.20.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.20.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 376/1147] blk.20.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 377/1147] blk.20.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.20.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 378/1147] blk.20.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 379/1147] blk.20.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 380/1147] blk.20.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 381/1147] blk.20.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 382/1147] blk.20.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.20.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 383/1147] blk.20.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.20.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 384/1147] blk.20.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.20.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 385/1147] blk.20.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 386/1147] blk.21.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 387/1147] blk.21.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 388/1147] blk.21.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.21.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 389/1147] blk.21.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.21.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 390/1147] blk.21.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.21.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 391/1147] blk.21.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 392/1147] blk.21.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 393/1147] blk.21.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 394/1147] blk.21.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.21.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 395/1147] blk.21.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 396/1147] blk.21.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.21.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 397/1147] blk.21.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 398/1147] blk.21.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 399/1147] blk.21.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 400/1147] blk.21.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 401/1147] blk.21.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.21.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 402/1147] blk.21.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.21.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 403/1147] blk.21.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.21.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 404/1147] blk.21.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 405/1147] blk.22.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 406/1147] blk.22.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 407/1147] blk.22.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.22.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 408/1147] blk.22.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.22.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 409/1147] blk.22.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.22.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 410/1147] blk.22.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 411/1147] blk.22.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 412/1147] blk.22.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 413/1147] blk.22.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.22.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 414/1147] blk.22.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 415/1147] blk.22.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.22.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 416/1147] blk.22.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 417/1147] blk.22.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 418/1147] blk.22.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 419/1147] blk.22.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 420/1147] blk.22.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.22.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 421/1147] blk.22.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.22.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 422/1147] blk.22.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.22.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 423/1147] blk.22.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 424/1147] blk.23.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 425/1147] blk.23.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 426/1147] blk.23.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.23.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 427/1147] blk.23.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.23.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 428/1147] blk.23.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.23.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 429/1147] blk.23.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 430/1147] blk.23.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 431/1147] blk.23.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 432/1147] blk.23.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.23.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 433/1147] blk.23.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 434/1147] blk.23.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.23.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 435/1147] blk.23.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 436/1147] blk.23.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 437/1147] blk.23.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 438/1147] blk.23.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 439/1147] blk.23.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.23.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 440/1147] blk.23.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.23.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 441/1147] blk.23.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.23.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 442/1147] blk.23.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 443/1147] blk.24.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 444/1147] blk.24.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 445/1147] blk.24.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.24.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 446/1147] blk.24.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.24.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 447/1147] blk.24.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.24.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 448/1147] blk.24.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 449/1147] blk.24.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 450/1147] blk.24.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 451/1147] blk.24.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.24.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 452/1147] blk.24.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 453/1147] blk.24.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.24.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 454/1147] blk.24.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 455/1147] blk.24.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 456/1147] blk.24.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 457/1147] blk.24.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 458/1147] blk.24.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.24.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 459/1147] blk.24.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.24.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 460/1147] blk.24.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.24.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 461/1147] blk.24.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 462/1147] blk.25.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 463/1147] blk.25.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 464/1147] blk.25.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.25.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 465/1147] blk.25.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.25.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 466/1147] blk.25.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.25.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 467/1147] blk.25.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 468/1147] blk.25.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 469/1147] blk.25.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 470/1147] blk.25.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.25.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 471/1147] blk.25.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 472/1147] blk.25.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.25.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 473/1147] blk.25.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 474/1147] blk.25.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 475/1147] blk.25.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 476/1147] blk.25.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 477/1147] blk.25.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.25.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 478/1147] blk.25.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.25.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 479/1147] blk.25.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.25.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 480/1147] blk.25.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 481/1147] blk.26.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 482/1147] blk.26.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 483/1147] blk.26.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.26.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 484/1147] blk.26.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.26.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 485/1147] blk.26.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.26.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 486/1147] blk.26.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 487/1147] blk.26.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 488/1147] blk.26.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 489/1147] blk.26.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.26.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 490/1147] blk.26.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 491/1147] blk.26.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.26.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 492/1147] blk.26.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 493/1147] blk.26.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 494/1147] blk.26.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 495/1147] blk.26.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 496/1147] blk.26.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.26.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 497/1147] blk.26.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.26.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 498/1147] blk.26.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.26.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 499/1147] blk.26.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 500/1147] blk.27.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 501/1147] blk.27.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 502/1147] blk.27.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.27.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 503/1147] blk.27.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.27.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 504/1147] blk.27.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.27.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 505/1147] blk.27.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 506/1147] blk.27.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 507/1147] blk.27.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 508/1147] blk.27.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.27.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 509/1147] blk.27.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 510/1147] blk.27.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.27.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 511/1147] blk.27.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 512/1147] blk.27.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 513/1147] blk.27.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 514/1147] blk.27.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 515/1147] blk.27.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.27.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 516/1147] blk.27.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.27.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 517/1147] blk.27.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.27.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 518/1147] blk.27.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 519/1147] blk.28.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 520/1147] blk.28.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 521/1147] blk.28.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.28.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 522/1147] blk.28.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.28.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 523/1147] blk.28.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.28.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 524/1147] blk.28.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 525/1147] blk.28.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 526/1147] blk.28.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 527/1147] blk.28.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.28.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 528/1147] blk.28.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 529/1147] blk.28.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.28.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 530/1147] blk.28.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 531/1147] blk.28.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 532/1147] blk.28.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 533/1147] blk.28.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 534/1147] blk.28.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.28.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 535/1147] blk.28.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.28.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 536/1147] blk.28.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.28.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 537/1147] blk.28.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 538/1147] blk.29.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 539/1147] blk.29.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 540/1147] blk.29.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.29.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 541/1147] blk.29.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.29.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 542/1147] blk.29.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.29.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 543/1147] blk.29.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 544/1147] blk.29.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 545/1147] blk.29.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 546/1147] blk.29.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.29.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 547/1147] blk.29.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 548/1147] blk.29.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.29.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 549/1147] blk.29.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 550/1147] blk.29.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 551/1147] blk.29.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 552/1147] blk.29.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 553/1147] blk.29.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.29.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 554/1147] blk.29.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.29.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 555/1147] blk.29.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.29.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 556/1147] blk.29.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 557/1147] blk.30.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 558/1147] blk.30.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 559/1147] blk.30.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.30.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 560/1147] blk.30.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.30.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 561/1147] blk.30.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.30.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 562/1147] blk.30.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 563/1147] blk.30.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 564/1147] blk.30.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 565/1147] blk.30.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.30.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 566/1147] blk.30.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 567/1147] blk.30.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.30.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 568/1147] blk.30.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 569/1147] blk.30.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 570/1147] blk.30.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 571/1147] blk.30.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 572/1147] blk.30.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.30.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 573/1147] blk.30.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.30.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 574/1147] blk.30.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.30.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 575/1147] blk.30.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 576/1147] blk.31.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 577/1147] blk.31.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 578/1147] blk.31.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.31.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 579/1147] blk.31.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.31.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 580/1147] blk.31.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.31.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 581/1147] blk.31.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 582/1147] blk.31.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 583/1147] blk.31.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 584/1147] blk.31.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.31.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 585/1147] blk.31.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 586/1147] blk.31.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.31.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 587/1147] blk.31.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 588/1147] blk.31.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 589/1147] blk.31.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 590/1147] blk.31.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 591/1147] blk.31.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.31.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 592/1147] blk.31.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.31.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 593/1147] blk.31.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.31.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 594/1147] blk.31.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 595/1147] blk.32.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 596/1147] blk.32.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 597/1147] blk.32.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.32.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 598/1147] blk.32.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.32.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 599/1147] blk.32.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.32.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 600/1147] blk.32.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 601/1147] blk.32.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 602/1147] blk.32.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 603/1147] blk.32.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.32.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 604/1147] blk.32.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 605/1147] blk.32.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.32.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 606/1147] blk.32.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 607/1147] blk.32.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 608/1147] blk.32.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 609/1147] blk.32.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 610/1147] blk.32.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.32.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 611/1147] blk.32.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.32.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 612/1147] blk.32.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.32.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 613/1147] blk.32.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 614/1147] blk.33.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 615/1147] blk.33.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 616/1147] blk.33.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.33.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 617/1147] blk.33.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.33.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 618/1147] blk.33.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.33.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 619/1147] blk.33.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 620/1147] blk.33.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 621/1147] blk.33.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 622/1147] blk.33.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.33.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 623/1147] blk.33.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 624/1147] blk.33.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.33.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 625/1147] blk.33.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 626/1147] blk.33.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 627/1147] blk.33.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 628/1147] blk.33.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 629/1147] blk.33.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.33.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 630/1147] blk.33.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.33.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 631/1147] blk.33.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.33.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 632/1147] blk.33.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 633/1147] blk.34.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 634/1147] blk.34.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 635/1147] blk.34.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.34.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 636/1147] blk.34.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.34.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 637/1147] blk.34.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.34.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 638/1147] blk.34.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 639/1147] blk.34.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 640/1147] blk.34.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 641/1147] blk.34.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.34.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 642/1147] blk.34.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 643/1147] blk.34.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.34.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 644/1147] blk.34.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 645/1147] blk.34.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 646/1147] blk.34.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 647/1147] blk.34.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 648/1147] blk.34.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.34.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 649/1147] blk.34.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.34.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 650/1147] blk.34.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.34.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 651/1147] blk.34.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 652/1147] blk.35.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 653/1147] blk.35.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 654/1147] blk.35.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.35.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 655/1147] blk.35.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.35.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 656/1147] blk.35.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.35.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 657/1147] blk.35.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 658/1147] blk.35.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 659/1147] blk.35.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 660/1147] blk.35.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.35.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 661/1147] blk.35.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 662/1147] blk.35.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.35.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 663/1147] blk.35.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 664/1147] blk.35.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 665/1147] blk.35.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 666/1147] blk.35.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 667/1147] blk.35.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.35.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 668/1147] blk.35.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.35.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 669/1147] blk.35.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.35.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 670/1147] blk.35.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 671/1147] blk.36.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 672/1147] blk.36.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 673/1147] blk.36.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.36.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 674/1147] blk.36.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.36.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 675/1147] blk.36.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.36.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 676/1147] blk.36.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 677/1147] blk.36.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 678/1147] blk.36.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 679/1147] blk.36.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.36.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 680/1147] blk.36.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 681/1147] blk.36.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.36.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 682/1147] blk.36.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 683/1147] blk.36.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 684/1147] blk.36.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 685/1147] blk.36.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 686/1147] blk.36.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.36.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 687/1147] blk.36.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.36.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 688/1147] blk.36.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.36.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 689/1147] blk.36.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 690/1147] blk.37.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 691/1147] blk.37.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 692/1147] blk.37.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.37.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 693/1147] blk.37.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.37.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 694/1147] blk.37.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.37.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 695/1147] blk.37.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 696/1147] blk.37.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 697/1147] blk.37.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 698/1147] blk.37.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.37.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 699/1147] blk.37.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 700/1147] blk.37.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.37.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 701/1147] blk.37.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 702/1147] blk.37.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 703/1147] blk.37.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 704/1147] blk.37.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 705/1147] blk.37.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.37.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 706/1147] blk.37.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.37.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 707/1147] blk.37.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.37.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 708/1147] blk.37.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 709/1147] blk.38.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 710/1147] blk.38.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 711/1147] blk.38.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.38.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 712/1147] blk.38.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.38.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 713/1147] blk.38.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.38.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 714/1147] blk.38.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 715/1147] blk.38.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 716/1147] blk.38.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 717/1147] blk.38.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.38.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 718/1147] blk.38.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 719/1147] blk.38.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.38.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 720/1147] blk.38.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 721/1147] blk.38.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 722/1147] blk.38.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 723/1147] blk.38.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 724/1147] blk.38.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.38.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 725/1147] blk.38.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.38.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 726/1147] blk.38.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.38.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 727/1147] blk.38.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 728/1147] blk.39.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 729/1147] blk.39.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 730/1147] blk.39.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.39.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 731/1147] blk.39.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.39.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 732/1147] blk.39.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.39.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 733/1147] blk.39.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 734/1147] blk.39.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 735/1147] blk.39.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 736/1147] blk.39.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.39.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 737/1147] blk.39.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 738/1147] blk.39.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.39.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 739/1147] blk.39.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 740/1147] blk.39.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 741/1147] blk.39.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 742/1147] blk.39.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 743/1147] blk.39.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.39.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 744/1147] blk.39.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.39.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 745/1147] blk.39.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.39.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 746/1147] blk.39.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 747/1147] blk.40.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 748/1147] blk.40.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 749/1147] blk.40.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.40.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 750/1147] blk.40.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.40.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 751/1147] blk.40.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.40.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 752/1147] blk.40.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 753/1147] blk.40.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 754/1147] blk.40.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 755/1147] blk.40.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.40.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 756/1147] blk.40.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 757/1147] blk.40.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.40.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 758/1147] blk.40.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 759/1147] blk.40.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 760/1147] blk.40.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 761/1147] blk.40.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 762/1147] blk.40.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.40.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 763/1147] blk.40.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.40.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 764/1147] blk.40.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.40.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 765/1147] blk.40.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 766/1147] blk.41.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 767/1147] blk.41.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 768/1147] blk.41.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.41.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 769/1147] blk.41.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.41.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 770/1147] blk.41.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.41.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 771/1147] blk.41.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 772/1147] blk.41.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 773/1147] blk.41.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 774/1147] blk.41.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.41.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 775/1147] blk.41.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 776/1147] blk.41.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.41.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 777/1147] blk.41.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 778/1147] blk.41.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 779/1147] blk.41.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 780/1147] blk.41.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 781/1147] blk.41.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.41.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 782/1147] blk.41.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.41.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 783/1147] blk.41.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.41.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 784/1147] blk.41.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 785/1147] blk.42.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 786/1147] blk.42.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 787/1147] blk.42.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.42.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 788/1147] blk.42.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.42.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 789/1147] blk.42.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.42.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 790/1147] blk.42.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 791/1147] blk.42.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 792/1147] blk.42.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 793/1147] blk.42.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.42.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 794/1147] blk.42.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 795/1147] blk.42.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.42.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 796/1147] blk.42.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 797/1147] blk.42.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 798/1147] blk.42.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 799/1147] blk.42.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 800/1147] blk.42.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.42.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 801/1147] blk.42.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.42.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 802/1147] blk.42.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.42.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 803/1147] blk.42.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 804/1147] blk.43.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 805/1147] blk.43.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 806/1147] blk.43.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.43.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 807/1147] blk.43.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.43.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 808/1147] blk.43.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.43.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 809/1147] blk.43.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 810/1147] blk.43.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 811/1147] blk.43.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 812/1147] blk.43.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.43.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 813/1147] blk.43.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 814/1147] blk.43.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.43.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 815/1147] blk.43.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 816/1147] blk.43.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 817/1147] blk.43.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 818/1147] blk.43.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 819/1147] blk.43.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.43.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 820/1147] blk.43.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.43.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 821/1147] blk.43.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.43.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 822/1147] blk.43.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 823/1147] blk.44.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 824/1147] blk.44.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 825/1147] blk.44.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.44.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 826/1147] blk.44.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.44.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 827/1147] blk.44.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.44.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 828/1147] blk.44.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 829/1147] blk.44.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 830/1147] blk.44.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 831/1147] blk.44.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.44.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 832/1147] blk.44.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 833/1147] blk.44.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.44.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 834/1147] blk.44.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 835/1147] blk.44.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 836/1147] blk.44.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 837/1147] blk.44.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 838/1147] blk.44.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.44.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 839/1147] blk.44.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.44.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 840/1147] blk.44.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.44.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 841/1147] blk.44.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 842/1147] blk.45.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 843/1147] blk.45.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 844/1147] blk.45.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.45.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 845/1147] blk.45.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.45.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 846/1147] blk.45.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.45.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 847/1147] blk.45.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 848/1147] blk.45.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 849/1147] blk.45.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 850/1147] blk.45.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.45.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 851/1147] blk.45.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 852/1147] blk.45.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.45.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 853/1147] blk.45.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 854/1147] blk.45.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 855/1147] blk.45.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 856/1147] blk.45.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 857/1147] blk.45.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.45.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 858/1147] blk.45.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.45.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 859/1147] blk.45.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.45.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 860/1147] blk.45.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 861/1147] blk.46.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 862/1147] blk.46.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 863/1147] blk.46.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.46.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 864/1147] blk.46.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.46.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 865/1147] blk.46.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.46.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 866/1147] blk.46.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 867/1147] blk.46.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 868/1147] blk.46.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 869/1147] blk.46.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.46.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 870/1147] blk.46.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 871/1147] blk.46.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.46.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 872/1147] blk.46.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 873/1147] blk.46.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 874/1147] blk.46.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 875/1147] blk.46.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 876/1147] blk.46.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.46.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 877/1147] blk.46.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.46.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 878/1147] blk.46.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.46.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 879/1147] blk.46.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 880/1147] blk.47.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 881/1147] blk.47.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 882/1147] blk.47.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.47.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 883/1147] blk.47.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.47.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 884/1147] blk.47.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.47.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 885/1147] blk.47.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 886/1147] blk.47.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 887/1147] blk.47.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 888/1147] blk.47.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.47.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 889/1147] blk.47.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 890/1147] blk.47.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.47.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 891/1147] blk.47.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 892/1147] blk.47.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 893/1147] blk.47.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 894/1147] blk.47.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 895/1147] blk.47.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.47.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 896/1147] blk.47.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.47.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 897/1147] blk.47.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.47.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 898/1147] blk.47.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 899/1147] blk.48.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 900/1147] blk.48.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 901/1147] blk.48.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.48.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 902/1147] blk.48.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.48.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 903/1147] blk.48.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.48.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 904/1147] blk.48.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 905/1147] blk.48.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 906/1147] blk.48.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 907/1147] blk.48.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.48.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 908/1147] blk.48.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 909/1147] blk.48.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.48.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 910/1147] blk.48.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 911/1147] blk.48.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 912/1147] blk.48.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 913/1147] blk.48.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 914/1147] blk.48.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.48.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 915/1147] blk.48.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.48.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 916/1147] blk.48.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.48.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 917/1147] blk.48.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 918/1147] blk.49.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 919/1147] blk.49.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 920/1147] blk.49.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.49.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 921/1147] blk.49.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.49.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 922/1147] blk.49.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.49.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 923/1147] blk.49.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 924/1147] blk.49.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 925/1147] blk.49.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 926/1147] blk.49.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.49.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 927/1147] blk.49.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 928/1147] blk.49.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.49.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 929/1147] blk.49.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 930/1147] blk.49.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 931/1147] blk.49.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 932/1147] blk.49.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 933/1147] blk.49.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.49.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 934/1147] blk.49.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.49.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 935/1147] blk.49.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.49.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 936/1147] blk.49.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 937/1147] blk.50.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 938/1147] blk.50.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 939/1147] blk.50.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.50.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 940/1147] blk.50.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.50.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 941/1147] blk.50.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.50.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 942/1147] blk.50.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 943/1147] blk.50.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 944/1147] blk.50.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 945/1147] blk.50.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.50.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 946/1147] blk.50.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 947/1147] blk.50.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.50.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 948/1147] blk.50.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 949/1147] blk.50.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 950/1147] blk.50.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 951/1147] blk.50.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 952/1147] blk.50.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.50.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 953/1147] blk.50.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.50.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 954/1147] blk.50.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.50.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 955/1147] blk.50.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 956/1147] blk.51.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 957/1147] blk.51.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 958/1147] blk.51.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.51.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 959/1147] blk.51.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.51.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 960/1147] blk.51.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.51.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 961/1147] blk.51.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 962/1147] blk.51.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 963/1147] blk.51.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 964/1147] blk.51.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.51.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 965/1147] blk.51.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 966/1147] blk.51.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.51.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 967/1147] blk.51.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 968/1147] blk.51.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 969/1147] blk.51.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 970/1147] blk.51.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 971/1147] blk.51.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.51.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 972/1147] blk.51.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.51.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 973/1147] blk.51.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.51.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 974/1147] blk.51.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 975/1147] blk.52.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 976/1147] blk.52.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 977/1147] blk.52.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.52.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 978/1147] blk.52.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.52.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 979/1147] blk.52.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.52.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 980/1147] blk.52.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[ 981/1147] blk.52.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[ 982/1147] blk.52.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[ 983/1147] blk.52.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.52.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[ 984/1147] blk.52.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[ 985/1147] blk.52.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.52.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[ 986/1147] blk.52.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[ 987/1147] blk.52.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[ 988/1147] blk.52.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[ 989/1147] blk.52.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 990/1147] blk.52.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.52.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[ 991/1147] blk.52.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.52.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 992/1147] blk.52.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.52.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[ 993/1147] blk.52.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[ 994/1147] blk.53.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[ 995/1147] blk.53.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[ 996/1147] blk.53.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.53.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 997/1147] blk.53.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.53.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 998/1147] blk.53.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.53.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[ 999/1147] blk.53.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[1000/1147] blk.53.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[1001/1147] blk.53.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[1002/1147] blk.53.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.53.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[1003/1147] blk.53.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[1004/1147] blk.53.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.53.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[1005/1147] blk.53.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[1006/1147] blk.53.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[1007/1147] blk.53.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[1008/1147] blk.53.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1009/1147] blk.53.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.53.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[1010/1147] blk.53.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.53.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1011/1147] blk.53.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.53.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1012/1147] blk.53.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1013/1147] blk.54.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[1014/1147] blk.54.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[1015/1147] blk.54.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.54.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1016/1147] blk.54.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.54.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1017/1147] blk.54.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.54.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1018/1147] blk.54.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[1019/1147] blk.54.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[1020/1147] blk.54.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[1021/1147] blk.54.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.54.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[1022/1147] blk.54.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[1023/1147] blk.54.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.54.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[1024/1147] blk.54.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[1025/1147] blk.54.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[1026/1147] blk.54.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[1027/1147] blk.54.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1028/1147] blk.54.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.54.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[1029/1147] blk.54.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.54.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1030/1147] blk.54.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.54.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1031/1147] blk.54.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1032/1147] blk.55.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[1033/1147] blk.55.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[1034/1147] blk.55.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.55.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1035/1147] blk.55.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.55.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1036/1147] blk.55.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.55.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1037/1147] blk.55.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[1038/1147] blk.55.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[1039/1147] blk.55.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[1040/1147] blk.55.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.55.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[1041/1147] blk.55.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[1042/1147] blk.55.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.55.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[1043/1147] blk.55.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[1044/1147] blk.55.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[1045/1147] blk.55.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[1046/1147] blk.55.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1047/1147] blk.55.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.55.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[1048/1147] blk.55.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.55.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1049/1147] blk.55.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.55.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1050/1147] blk.55.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1051/1147] blk.56.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[1052/1147] blk.56.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[1053/1147] blk.56.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.56.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1054/1147] blk.56.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.56.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1055/1147] blk.56.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.56.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1056/1147] blk.56.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[1057/1147] blk.56.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[1058/1147] blk.56.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[1059/1147] blk.56.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.56.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[1060/1147] blk.56.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[1061/1147] blk.56.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.56.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[1062/1147] blk.56.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[1063/1147] blk.56.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[1064/1147] blk.56.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[1065/1147] blk.56.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1066/1147] blk.56.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.56.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[1067/1147] blk.56.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.56.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1068/1147] blk.56.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.56.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1069/1147] blk.56.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1070/1147] blk.57.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[1071/1147] blk.57.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[1072/1147] blk.57.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.57.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1073/1147] blk.57.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.57.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1074/1147] blk.57.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.57.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1075/1147] blk.57.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[1076/1147] blk.57.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[1077/1147] blk.57.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[1078/1147] blk.57.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.57.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[1079/1147] blk.57.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[1080/1147] blk.57.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.57.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[1081/1147] blk.57.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[1082/1147] blk.57.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[1083/1147] blk.57.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[1084/1147] blk.57.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1085/1147] blk.57.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.57.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[1086/1147] blk.57.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.57.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1087/1147] blk.57.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.57.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1088/1147] blk.57.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1089/1147] blk.58.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[1090/1147] blk.58.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[1091/1147] blk.58.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.58.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1092/1147] blk.58.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.58.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1093/1147] blk.58.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.58.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1094/1147] blk.58.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[1095/1147] blk.58.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[1096/1147] blk.58.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[1097/1147] blk.58.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.58.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[1098/1147] blk.58.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[1099/1147] blk.58.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.58.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[1100/1147] blk.58.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[1101/1147] blk.58.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[1102/1147] blk.58.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[1103/1147] blk.58.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1104/1147] blk.58.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.58.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[1105/1147] blk.58.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.58.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1106/1147] blk.58.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.58.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1107/1147] blk.58.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1108/1147] blk.59.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[1109/1147] blk.59.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[1110/1147] blk.59.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.59.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1111/1147] blk.59.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.59.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1112/1147] blk.59.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.59.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1113/1147] blk.59.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[1114/1147] blk.59.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[1115/1147] blk.59.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[1116/1147] blk.59.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.59.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[1117/1147] blk.59.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[1118/1147] blk.59.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.59.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[1119/1147] blk.59.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[1120/1147] blk.59.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[1121/1147] blk.59.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[1122/1147] blk.59.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1123/1147] blk.59.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.59.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[1124/1147] blk.59.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.59.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1125/1147] blk.59.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.59.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1126/1147] blk.59.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1127/1147] blk.60.exp_probs_b.bias - [ 256, 1, 1, 1], type = f32, size = 0.001 MB
[1128/1147] blk.60.ffn_gate_inp.weight - [ 7168, 256, 1, 1], type = f32, size = 7.000 MB
[1129/1147] blk.60.ffn_down_shexp.weight - [ 2048, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.60.ffn_down_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1130/1147] blk.60.ffn_gate_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.60.ffn_gate_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1131/1147] blk.60.ffn_up_shexp.weight - [ 7168, 2048, 1, 1], type = f16, Using custom type q6_K for tensor blk.60.ffn_up_shexp.weight
converting to q6_K .. size = 28.00 MiB -> 11.48 MiB
[1132/1147] blk.60.attn_kv_a_norm.weight - [ 512, 1, 1, 1], type = f32, size = 0.002 MB
[1133/1147] blk.60.attn_kv_a_mqa.weight - [ 7168, 576, 1, 1], type = f16, converting to q6_K .. size = 7.88 MiB -> 3.23 MiB
[1134/1147] blk.60.attn_kv_b.weight - [ 512, 32768, 1, 1], type = f16, converting to q6_K .. size = 32.00 MiB -> 13.12 MiB
[1135/1147] blk.60.attn_k_b.weight - [ 128, 65536, 1, 1], type = f16,
llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0
====== llama_model_quantize_internal: did not find weights for blk.60.attn_k_b.weight
converting to q8_0 .. size = 16.00 MiB -> 8.50 MiB
[1136/1147] blk.60.attn_v_b.weight - [ 512, 16384, 1, 1], type = f16, converting to q6_K .. size = 16.00 MiB -> 6.56 MiB
[1137/1147] blk.60.attn_output.weight - [16384, 7168, 1, 1], type = f16, Using custom type q6_K for tensor blk.60.attn_output.weight
converting to q6_K .. size = 224.00 MiB -> 91.88 MiB
[1138/1147] blk.60.attn_q_a_norm.weight - [ 1536, 1, 1, 1], type = f32, size = 0.006 MB
[1139/1147] blk.60.attn_q_a.weight - [ 7168, 1536, 1, 1], type = f16, converting to q6_K .. size = 21.00 MiB -> 8.61 MiB
[1140/1147] blk.60.attn_q_b.weight - [ 1536, 24576, 1, 1], type = f16, converting to q6_K .. size = 72.00 MiB -> 29.53 MiB
[1141/1147] output.weight - [ 7168, 129280, 1, 1], type = f16, Using custom type q6_K for tensor output.weight
====== llama_model_quantize_internal: did not find weights for output.weight
converting to q6_K .. size = 1767.50 MiB -> 724.95 MiB
[1142/1147] blk.60.attn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1143/1147] blk.60.ffn_down_exps.weight - [ 2048, 7168, 256, 1], type = f16, Using custom type iq4_k for tensor blk.60.ffn_down_exps.weight
converting to iq4_k .. size = 7168.00 MiB -> 2016.00 MiB
[1144/1147] blk.60.ffn_gate_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.60.ffn_gate_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1145/1147] blk.60.ffn_up_exps.weight - [ 7168, 2048, 256, 1], type = f16, Using custom type iq3_k for tensor blk.60.ffn_up_exps.weight
converting to iq3_k .. size = 7168.00 MiB -> 1540.00 MiB
[1146/1147] blk.60.ffn_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
[1147/1147] output_norm.weight - [ 7168, 1, 1, 1], type = f32, size = 0.027 MB
llama_model_quantize_internal: model size = 1282038.27 MB
llama_model_quantize_internal: quant size = 318818.01 MB
llama_model_quantize_internal: WARNING: 61 of 785 tensor(s) required fallback quantization
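For context on the fallback warning above: k-quants such as q6_K pack each tensor row into 256-element super-blocks, so the 128-column `attn_k_b` rows cannot use them and the quantizer drops to q8_0 (32-element blocks) instead — which matches the 61 fallback tensors, one `attn_k_b` per layer. A minimal sketch of that divisibility check, assuming the standard ggml block sizes (256 for q6_K, 32 for q8_0) rather than the quantizer's actual code:

```bash
# Sketch only (not the quantizer's code): why blk.*.attn_k_b ends up as q8_0.
# Assumed block sizes: q6_K super-block = 256 elements, q8_0 block = 32 elements.
row_cols=128   # attn_k_b row width from the log: [ 128, 65536, 1, 1]
if (( row_cols % 256 != 0 )) && (( row_cols % 32 == 0 )); then
    echo "q6_K needs rows divisible by 256 -> falling back to q8_0 for ${row_cols}-column rows"
fi
```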
👤 davidsyoung commented the 2025-03-18 at 10:41:34:
PPL run (NaNs)
./build/bin/llama-perplexity -m /models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf -f /models/wiki.test.raw -fmoe -mla 2 -fa -ts 24/24/24/24/24/24/24/24/24/24/24/24/24/24/24/24 -c 512 -ub 512 --n-gpu-layers 999 -ot "blk\.3\.ffn_(down|gate|up)_exps\.weight|blk\.4\.ffn_(down|gate|up)_exps\.weight|blk\.5\.ffn_(down|gate|up)_exps\.weight=CUDA0" -ot "blk\.6\.ffn_(down|gate|up)_exps\.weight|blk\.7\.ffn_(down|gate|up)_exps\.weight|blk\.8\.ffn_(down|gate|up)_exps\.weight=CUDA1" -ot "blk\.9\.ffn_(down|gate|up)_exps\.weight|blk\.10\.ffn_(down|gate|up)_exps\.weight|blk\.11\.ffn_(down|gate|up)_exps\.weight|blk\.12\.ffn_(down|gate|up)_exps\.weight=CUDA2" -ot "blk\.13\.ffn_(down|gate|up)_exps\.weight|blk\.14\.ffn_(down|gate|up)_exps\.weight|blk\.15\.ffn_(down|gate|up)_exps\.weight|blk\.16\.ffn_(down|gate|up)_exps\.weight=CUDA3" -ot "blk\.17\.ffn_(down|gate|up)_exps\.weight|blk\.18\.ffn_(down|gate|up)_exps\.weight|blk\.19\.ffn_(down|gate|up)_exps\.weight|blk\.20\.ffn_(down|gate|up)_exps\.weight=CUDA4" -ot "blk\.21\.ffn_(down|gate|up)_exps\.weight|blk\.22\.ffn_(down|gate|up)_exps\.weight|blk\.23\.ffn_(down|gate|up)_exps\.weight|blk\.24\.ffn_(down|gate|up)_exps\.weight=CUDA5" -ot "blk\.25\.ffn_(down|gate|up)_exps\.weight|blk\.26\.ffn_(down|gate|up)_exps\.weight|blk\.27\.ffn_(down|gate|up)_exps\.weight|blk\.28\.ffn_(down|gate|up)_exps\.weight=CUDA6" -ot "blk\.29\.ffn_(down|gate|up)_exps\.weight|blk\.30\.ffn_(down|gate|up)_exps\.weight|blk\.31\.ffn_(down|gate|up)_exps\.weight|blk\.32\.ffn_(down|gate|up)_exps\.weight=CUDA7" -ot "blk\.33\.ffn_(down|gate|up)_exps\.weight|blk\.34\.ffn_(down|gate|up)_exps\.weight|blk\.35\.ffn_(down|gate|up)_exps\.weight|blk\.36\.ffn_(down|gate|up)_exps\.weight=CUDA8" -ot "blk\.37\.ffn_(down|gate|up)_exps\.weight|blk\.38\.ffn_(down|gate|up)_exps\.weight|blk\.39\.ffn_(down|gate|up)_exps\.weight|blk\.40\.ffn_(down|gate|up)_exps\.weight=CUDA9" -ot "blk\.41\.ffn_(down|gate|up)_exps\.weight|blk\.42\.ffn_(down|gate|up)_exps\.weight|blk\.43\.ffn_(down|gate|up)_exps\.weight|blk\.44\.ffn_(down|gate|up)_exps\.weight=CUDA10" -ot "blk\.45\.ffn_(down|gate|up)_exps\.weight|blk\.46\.ffn_(down|gate|up)_exps\.weight|blk\.47\.ffn_(down|gate|up)_exps\.weight|blk\.48\.ffn_(down|gate|up)_exps\.weight=CUDA11" -ot "blk\.49\.ffn_(down|gate|up)_exps\.weight|blk\.50\.ffn_(down|gate|up)_exps\.weight|blk\.51\.ffn_(down|gate|up)_exps\.weight|blk\.52\.ffn_(down|gate|up)_exps\.weight=CUDA12" -ot "blk\.53\.ffn_(down|gate|up)_exps\.weight|blk\.54\.ffn_(down|gate|up)_exps\.weight|blk\.55\.ffn_(down|gate|up)_exps\.weight|blk\.56\.ffn_(down|gate|up)_exps\.weight=CUDA13" -ot "blk\.57\.ffn_(down|gate|up)_exps\.weight|blk\.58\.ffn_(down|gate|up)_exps\.weight|blk\.59\.ffn_(down|gate|up)_exps\.weight|blk\.60\.ffn_(down|gate|up)_exps\.weight=CUDA14" --seed 1741529602 --temp 0.5
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 16 CUDA devices:
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 2: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 3: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 4: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 5: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 6: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 7: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 8: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 9: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 10: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 11: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 12: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 13: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 14: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 15: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
main: build = 0 (unknown)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: seed = 1741529602
llama_model_loader: loaded meta data with 54 key-value pairs and 1147 tensors from /models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = deepseek2
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = unsloth_DeepSeek R1 BF16
llama_model_loader: - kv 3: general.size_label str = 256x21B
llama_model_loader: - kv 4: general.license str = mit
llama_model_loader: - kv 5: general.base_model.count u32 = 1
llama_model_loader: - kv 6: general.base_model.0.name str = DeepSeek R1
llama_model_loader: - kv 7: general.base_model.0.organization str = Deepseek Ai
llama_model_loader: - kv 8: general.base_model.0.repo_url str = https://huggingface.co/deepseek-ai/De...
llama_model_loader: - kv 9: general.tags arr[str,3] = ["deepseek", "unsloth", "transformers"]
llama_model_loader: - kv 10: general.languages arr[str,1] = ["en"]
llama_model_loader: - kv 11: deepseek2.block_count u32 = 61
llama_model_loader: - kv 12: deepseek2.context_length u32 = 163840
llama_model_loader: - kv 13: deepseek2.embedding_length u32 = 7168
llama_model_loader: - kv 14: deepseek2.feed_forward_length u32 = 18432
llama_model_loader: - kv 15: deepseek2.attention.head_count u32 = 128
llama_model_loader: - kv 16: deepseek2.attention.head_count_kv u32 = 128
llama_model_loader: - kv 17: deepseek2.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 18: deepseek2.attention.layer_norm_rms_epsilon f32 = 0.000001
llama_model_loader: - kv 19: deepseek2.expert_used_count u32 = 8
llama_model_loader: - kv 20: general.file_type u32 = 18
llama_model_loader: - kv 21: deepseek2.leading_dense_block_count u32 = 3
llama_model_loader: - kv 22: deepseek2.vocab_size u32 = 129280
llama_model_loader: - kv 23: deepseek2.attention.q_lora_rank u32 = 1536
llama_model_loader: - kv 24: deepseek2.attention.kv_lora_rank u32 = 512
llama_model_loader: - kv 25: deepseek2.attention.key_length u32 = 192
llama_model_loader: - kv 26: deepseek2.attention.value_length u32 = 128
llama_model_loader: - kv 27: deepseek2.expert_feed_forward_length u32 = 2048
llama_model_loader: - kv 28: deepseek2.expert_count u32 = 256
llama_model_loader: - kv 29: deepseek2.expert_shared_count u32 = 1
llama_model_loader: - kv 30: deepseek2.expert_weights_scale f32 = 2.500000
llama_model_loader: - kv 31: deepseek2.expert_weights_norm bool = true
llama_model_loader: - kv 32: deepseek2.expert_gating_func u32 = 2
llama_model_loader: - kv 33: deepseek2.rope.dimension_count u32 = 64
llama_model_loader: - kv 34: deepseek2.rope.scaling.type str = yarn
llama_model_loader: - kv 35: deepseek2.rope.scaling.factor f32 = 40.000000
llama_model_loader: - kv 36: deepseek2.rope.scaling.original_context_length u32 = 4096
llama_model_loader: - kv 37: deepseek2.rope.scaling.yarn_log_multiplier f32 = 0.100000
llama_model_loader: - kv 38: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 39: tokenizer.ggml.pre str = deepseek-v3
llama_model_loader: - kv 40: tokenizer.ggml.tokens arr[str,129280] = ["<|begin▁of▁sentence|>", "<<3C>...
llama_model_loader: - kv 41: tokenizer.ggml.token_type arr[i32,129280] = [3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 42: tokenizer.ggml.merges arr[str,127741] = ["Ġ t", "Ġ a", "i n", "Ġ Ġ", "h e...
llama_model_loader: - kv 43: tokenizer.ggml.bos_token_id u32 = 0
llama_model_loader: - kv 44: tokenizer.ggml.eos_token_id u32 = 1
llama_model_loader: - kv 45: tokenizer.ggml.padding_token_id u32 = 128815
llama_model_loader: - kv 46: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 47: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 48: tokenizer.chat_template str = {% if not add_generation_prompt is de...
llama_model_loader: - kv 49: general.quantization_version u32 = 2
llama_model_loader: - kv 50: quantize.imatrix.file str = /models/deepseek-config/imatrix.dat
llama_model_loader: - kv 51: quantize.imatrix.dataset str = imatrix-training-full-3
llama_model_loader: - kv 52: quantize.imatrix.entries_count i32 = 720
llama_model_loader: - kv 53: quantize.imatrix.chunks_count i32 = 315
llama_model_loader: - type f32: 361 tensors
llama_model_loader: - type q8_0: 62 tensors
llama_model_loader: - type q5_K: 6 tensors
llama_model_loader: - type q6_K: 550 tensors
llama_model_loader: - type iq3_k: 104 tensors
llama_model_loader: - type iq4_k: 64 tensors
llm_load_vocab: special tokens cache size = 819
llm_load_vocab: token to piece cache size = 0.8223 MB
llm_load_print_meta: format = GGUF V3 (latest)
llm_load_print_meta: arch = deepseek2
llm_load_print_meta: vocab type = BPE
llm_load_print_meta: n_vocab = 129280
llm_load_print_meta: n_merges = 127741
llm_load_print_meta: vocab_only = 0
llm_load_print_meta: n_ctx_train = 163840
llm_load_print_meta: n_embd = 7168
llm_load_print_meta: n_layer = 61
llm_load_print_meta: n_head = 128
llm_load_print_meta: n_head_kv = 128
llm_load_print_meta: n_rot = 64
llm_load_print_meta: n_swa = 0
llm_load_print_meta: n_embd_head_k = 192
llm_load_print_meta: n_embd_head_v = 128
llm_load_print_meta: n_gqa = 1
llm_load_print_meta: n_embd_k_gqa = 24576
llm_load_print_meta: n_embd_v_gqa = 16384
llm_load_print_meta: f_norm_eps = 0.0e+00
llm_load_print_meta: f_norm_rms_eps = 1.0e-06
llm_load_print_meta: f_clamp_kqv = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: f_logit_scale = 0.0e+00
llm_load_print_meta: n_ff = 18432
llm_load_print_meta: n_expert = 256
llm_load_print_meta: n_expert_used = 8
llm_load_print_meta: causal attn = 1
llm_load_print_meta: pooling type = 0
llm_load_print_meta: rope type = 0
llm_load_print_meta: rope scaling = yarn
llm_load_print_meta: freq_base_train = 10000.0
llm_load_print_meta: freq_scale_train = 0.025
llm_load_print_meta: n_ctx_orig_yarn = 4096
llm_load_print_meta: rope_finetuned = unknown
llm_load_print_meta: ssm_d_conv = 0
llm_load_print_meta: ssm_d_inner = 0
llm_load_print_meta: ssm_d_state = 0
llm_load_print_meta: ssm_dt_rank = 0
llm_load_print_meta: model type = 671B
llm_load_print_meta: model ftype = Q6_K
llm_load_print_meta: model params = 672.050 B
llm_load_print_meta: model size = 311.346 GiB (3.980 BPW)
llm_load_print_meta: repeating layers = 309.721 GiB (3.970 BPW, 670.196 B parameters)
llm_load_print_meta: general.name = unsloth_DeepSeek R1 BF16
llm_load_print_meta: BOS token = 0 '<|begin▁of▁sentence|>'
llm_load_print_meta: EOS token = 1 '<|end▁of▁sentence|>'
llm_load_print_meta: PAD token = 128815 '<|PAD▁TOKEN|>'
llm_load_print_meta: LF token = 131 'Ä'
llm_load_print_meta: max token length = 256
llm_load_print_meta: n_layer_dense_lead = 3
llm_load_print_meta: n_lora_q = 1536
llm_load_print_meta: n_lora_kv = 512
llm_load_print_meta: n_ff_exp = 2048
llm_load_print_meta: n_expert_shared = 1
llm_load_print_meta: expert_weights_scale = 2.5
llm_load_print_meta: expert_weights_norm = 1
llm_load_print_meta: expert_gating_func = sigmoid
llm_load_print_meta: rope_yarn_log_mul = 0.1000
llm_load_tensors: ggml ctx size = 7.94 MiB
Tensor blk.3.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.3.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.3.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.6.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.6.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.6.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.9.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.9.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.9.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.13.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.13.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.13.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.17.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.17.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.17.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.21.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.21.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.21.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.25.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.25.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.25.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.29.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.29.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.29.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.33.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.33.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.33.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.37.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.37.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.37.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.41.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.41.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.41.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.45.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.45.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.45.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.49.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.49.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.49.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.53.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.53.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.53.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.57.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.57.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.57.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_up_exps.weight buffer type overriden to CUDA14
llm_load_tensors: offloading 61 repeating layers to GPU
llm_load_tensors: offloading non-repeating layers to GPU
llm_load_tensors: offloaded 62/62 layers to GPU
llm_load_tensors: CPU buffer size = 938.98 MiB
llm_load_tensors: CUDA0 buffer size = 21105.69 MiB
llm_load_tensors: CUDA1 buffer size = 20299.82 MiB
llm_load_tensors: CUDA2 buffer size = 21195.82 MiB
llm_load_tensors: CUDA3 buffer size = 21195.82 MiB
llm_load_tensors: CUDA4 buffer size = 21195.82 MiB
llm_load_tensors: CUDA5 buffer size = 21195.82 MiB
llm_load_tensors: CUDA6 buffer size = 21195.82 MiB
llm_load_tensors: CUDA7 buffer size = 20992.86 MiB
llm_load_tensors: CUDA8 buffer size = 21195.82 MiB
llm_load_tensors: CUDA9 buffer size = 21195.82 MiB
llm_load_tensors: CUDA10 buffer size = 21195.82 MiB
llm_load_tensors: CUDA11 buffer size = 21195.82 MiB
llm_load_tensors: CUDA12 buffer size = 21195.82 MiB
llm_load_tensors: CUDA13 buffer size = 21195.82 MiB
llm_load_tensors: CUDA14 buffer size = 21195.82 MiB
llm_load_tensors: CUDA15 buffer size = 1130.89 MiB
....................................................................................................
llama_new_context_with_model: n_ctx = 2048
llama_new_context_with_model: n_batch = 2048
llama_new_context_with_model: n_ubatch = 512
llama_new_context_with_model: flash_attn = 1
llama_new_context_with_model: mla_attn = 2
llama_new_context_with_model: attn_max_b = 0
llama_new_context_with_model: fused_moe = 1
llama_new_context_with_model: ser = -1, 0
llama_new_context_with_model: freq_base = 10000.0
llama_new_context_with_model: freq_scale = 0.025
llama_kv_cache_init: layer 0: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 1: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 2: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 3: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 4: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 5: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 6: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 7: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 8: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 9: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 10: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 11: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 12: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 13: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 14: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 15: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 16: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 17: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 18: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 19: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 20: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 21: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 22: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 23: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 24: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 25: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 26: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 27: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 28: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 29: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 30: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 31: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 32: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 33: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 34: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 35: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 36: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 37: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 38: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 39: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 40: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 41: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 42: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 43: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 44: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 45: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 46: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 47: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 48: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 49: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 50: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 51: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 52: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 53: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 54: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 55: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 56: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 57: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 58: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 59: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 60: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: CUDA0 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA1 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA2 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA3 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA4 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA5 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA6 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA7 KV buffer size = 6.75 MiB
llama_kv_cache_init: CUDA8 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA9 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA10 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA11 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA12 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA13 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA14 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA15 KV buffer size = 4.50 MiB
llama_new_context_with_model: KV self size = 137.25 MiB, c^KV (f16): 137.25 MiB, kv^T: not used
llama_new_context_with_model: CUDA_Host output buffer size = 1.97 MiB
llama_new_context_with_model: CUDA0 compute buffer size = 455.00 MiB
llama_new_context_with_model: CUDA1 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA2 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA3 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA4 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA5 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA6 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA7 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA8 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA9 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA10 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA11 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA12 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA13 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA14 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA15 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA_Host compute buffer size = 18.01 MiB
llama_new_context_with_model: graph nodes = 3487
llama_new_context_with_model: graph splits = 65
system_info: n_threads = 64 / 128 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 |
perplexity: tokenizing the input ..
perplexity: tokenization took 1167.48 ms
perplexity: calculating perplexity over 561 chunks, n_ctx=512, batch_size=2048, n_seq=4
perplexity: 22.93 seconds per pass - ETA 53.58 minutes
[1]2.5633,[2]3.3137,[3]nan,[4]nan,[5]nan,[6]nan,[7]nan,[8]nan,[9]nan,[10]nan,[11]nan,[12]nan,[13]nan,[14]nan,[15]nan,[16]nan,[17]nan,[18]nan,[19]nan,[20]nan,[21]nan,[22]nan,[23]nan,[24]nan,[25]nan,[26]nan,[27]nan,[28]nan,[29]nan,[30]nan,[31]nan,[32]nan,[33]nan,[34]nan,[35]nan,[36]nan,[37]nan,[38]nan,[39]nan,[40]nan,[41]nan,[42]nan,[43]nan,[44]nan,[45]nan,[46]nan,[47]nan,[48]nan,[49]nan,[50]nan,[51]nan,[52]nan,[53]nan,[54]nan,[55]nan,[56]nan,[57]nan,[58]nan,[59]nan,[60]nan,[61]nan,[62]nan,[63]nan,[64]nan,[65]nan,[66]nan,[67]nan,[68]nan,[69]nan,[70]nan,[71]nan,[72]nan,[73]nan,[74]nan,[75]nan,[76]nan,[77]nan,[78]nan,[79]nan,[80]nan,[81]nan,[82]nan,[83]nan,[84]nan,[85]nan,[86]nan,[87]nan,[88]nan,[89]nan,[90]nan,[91]nan,[92]nan,^C
👤 ikawrakow commented the 2025-03-18 at 10:45:08:
Did you enable the GGML_CUDA_IQK_FORCE_BF16 option when building?
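For reference, a minimal sketch of rebuilding with that option enabled (only `GGML_CUDA_IQK_FORCE_BF16` is taken from the comment above; `-DGGML_CUDA=ON` is assumed to be the CUDA toggle already used for this build):

```bash
# Hedged sketch: reconfigure with the BF16-forcing option, then rebuild.
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_IQK_FORCE_BF16=ON
cmake --build build --config Release -j
```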
👤 davidsyoung commented the 2025-03-18 at 10:49:14:
D'oh. Back to the drawing board. Apologies! Will report back.
👤 davidsyoung commented the 2025-03-18 at 13:11:13:
Works! Great work!
Successful PPL run
root@f9b3ae98b5a1:/app# ./build/bin/llama-perplexity -m /models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf -f /models/wiki.test.raw -fmoe -mla 2 -fa -ts 24/24/24/24/24/24/24/24/24/24/24/24/24/24/24/24 -c 512 -ub 512 --n-gpu-layers 999 -ot "blk\.3\.ffn_(down|gate|up)_exps\.weight|blk\.4\.ffn_(down|gate|up)_exps\.weight|blk\.5\.ffn_(down|gate|up)_exps\.weight=CUDA0" -ot "blk\.6\.ffn_(down|gate|up)_exps\.weight|blk\.7\.ffn_(down|gate|up)_exps\.weight|blk\.8\.ffn_(down|gate|up)_exps\.weight=CUDA1" -ot "blk\.9\.ffn_(down|gate|up)_exps\.weight|blk\.10\.ffn_(down|gate|up)_exps\.weight|blk\.11\.ffn_(down|gate|up)_exps\.weight|blk\.12\.ffn_(down|gate|up)_exps\.weight=CUDA2" -ot "blk\.13\.ffn_(down|gate|up)_exps\.weight|blk\.14\.ffn_(down|gate|up)_exps\.weight|blk\.15\.ffn_(down|gate|up)_exps\.weight|blk\.16\.ffn_(down|gate|up)_exps\.weight=CUDA3" -ot "blk\.17\.ffn_(down|gate|up)_exps\.weight|blk\.18\.ffn_(down|gate|up)_exps\.weight|blk\.19\.ffn_(down|gate|up)_exps\.weight|blk\.20\.ffn_(down|gate|up)_exps\.weight=CUDA4" -ot "blk\.21\.ffn_(down|gate|up)_exps\.weight|blk\.22\.ffn_(down|gate|up)_exps\.weight|blk\.23\.ffn_(down|gate|up)_exps\.weight|blk\.24\.ffn_(down|gate|up)_exps\.weight=CUDA5" -ot "blk\.25\.ffn_(down|gate|up)_exps\.weight|blk\.26\.ffn_(down|gate|up)_exps\.weight|blk\.27\.ffn_(down|gate|up)_exps\.weight|blk\.28\.ffn_(down|gate|up)_exps\.weight=CUDA6" -ot "blk\.29\.ffn_(down|gate|up)_exps\.weight|blk\.30\.ffn_(down|gate|up)_exps\.weight|blk\.31\.ffn_(down|gate|up)_exps\.weight|blk\.32\.ffn_(down|gate|up)_exps\.weight=CUDA7" -ot "blk\.33\.ffn_(down|gate|up)_exps\.weight|blk\.34\.ffn_(down|gate|up)_exps\.weight|blk\.35\.ffn_(down|gate|up)_exps\.weight|blk\.36\.ffn_(down|gate|up)_exps\.weight=CUDA8" -ot "blk\.37\.ffn_(down|gate|up)_exps\.weight|blk\.38\.ffn_(down|gate|up)_exps\.weight|blk\.39\.ffn_(down|gate|up)_exps\.weight|blk\.40\.ffn_(down|gate|up)_exps\.weight=CUDA9" -ot "blk\.41\.ffn_(down|gate|up)_exps\.weight|blk\.42\.ffn_(down|gate|up)_exps\.weight|blk\.43\.ffn_(down|gate|up)_exps\.weight|blk\.44\.ffn_(down|gate|up)_exps\.weight=CUDA10" -ot "blk\.45\.ffn_(down|gate|up)_exps\.weight|blk\.46\.ffn_(down|gate|up)_exps\.weight|blk\.47\.ffn_(down|gate|up)_exps\.weight|blk\.48\.ffn_(down|gate|up)_exps\.weight=CUDA11" -ot "blk\.49\.ffn_(down|gate|up)_exps\.weight|blk\.50\.ffn_(down|gate|up)_exps\.weight|blk\.51\.ffn_(down|gate|up)_exps\.weight|blk\.52\.ffn_(down|gate|up)_exps\.weight=CUDA12" -ot "blk\.53\.ffn_(down|gate|up)_exps\.weight|blk\.54\.ffn_(down|gate|up)_exps\.weight|blk\.55\.ffn_(down|gate|up)_exps\.weight|blk\.56\.ffn_(down|gate|up)_exps\.weight=CUDA13" -ot "blk\.57\.ffn_(down|gate|up)_exps\.weight|blk\.58\.ffn_(down|gate|up)_exps\.weight|blk\.59\.ffn_(down|gate|up)_exps\.weight|blk\.60\.ffn_(down|gate|up)_exps\.weight=CUDA14" --seed 1741529602 --temp 0.5
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 16 CUDA devices:
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 2: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 3: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 4: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 5: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 6: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 7: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 8: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 9: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 10: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 11: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 12: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 13: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 14: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 15: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
main: build = 0 (unknown)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: seed = 1741529602
llama_model_loader: loaded meta data with 54 key-value pairs and 1147 tensors from /models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = deepseek2
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = unsloth_DeepSeek R1 BF16
llama_model_loader: - kv 3: general.size_label str = 256x21B
llama_model_loader: - kv 4: general.license str = mit
llama_model_loader: - kv 5: general.base_model.count u32 = 1
llama_model_loader: - kv 6: general.base_model.0.name str = DeepSeek R1
llama_model_loader: - kv 7: general.base_model.0.organization str = Deepseek Ai
llama_model_loader: - kv 8: general.base_model.0.repo_url str = https://huggingface.co/deepseek-ai/De...
llama_model_loader: - kv 9: general.tags arr[str,3] = ["deepseek", "unsloth", "transformers"]
llama_model_loader: - kv 10: general.languages arr[str,1] = ["en"]
llama_model_loader: - kv 11: deepseek2.block_count u32 = 61
llama_model_loader: - kv 12: deepseek2.context_length u32 = 163840
llama_model_loader: - kv 13: deepseek2.embedding_length u32 = 7168
llama_model_loader: - kv 14: deepseek2.feed_forward_length u32 = 18432
llama_model_loader: - kv 15: deepseek2.attention.head_count u32 = 128
llama_model_loader: - kv 16: deepseek2.attention.head_count_kv u32 = 128
llama_model_loader: - kv 17: deepseek2.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 18: deepseek2.attention.layer_norm_rms_epsilon f32 = 0.000001
llama_model_loader: - kv 19: deepseek2.expert_used_count u32 = 8
llama_model_loader: - kv 20: general.file_type u32 = 18
llama_model_loader: - kv 21: deepseek2.leading_dense_block_count u32 = 3
llama_model_loader: - kv 22: deepseek2.vocab_size u32 = 129280
llama_model_loader: - kv 23: deepseek2.attention.q_lora_rank u32 = 1536
llama_model_loader: - kv 24: deepseek2.attention.kv_lora_rank u32 = 512
llama_model_loader: - kv 25: deepseek2.attention.key_length u32 = 192
llama_model_loader: - kv 26: deepseek2.attention.value_length u32 = 128
llama_model_loader: - kv 27: deepseek2.expert_feed_forward_length u32 = 2048
llama_model_loader: - kv 28: deepseek2.expert_count u32 = 256
llama_model_loader: - kv 29: deepseek2.expert_shared_count u32 = 1
llama_model_loader: - kv 30: deepseek2.expert_weights_scale f32 = 2.500000
llama_model_loader: - kv 31: deepseek2.expert_weights_norm bool = true
llama_model_loader: - kv 32: deepseek2.expert_gating_func u32 = 2
llama_model_loader: - kv 33: deepseek2.rope.dimension_count u32 = 64
llama_model_loader: - kv 34: deepseek2.rope.scaling.type str = yarn
llama_model_loader: - kv 35: deepseek2.rope.scaling.factor f32 = 40.000000
llama_model_loader: - kv 36: deepseek2.rope.scaling.original_context_length u32 = 4096
llama_model_loader: - kv 37: deepseek2.rope.scaling.yarn_log_multiplier f32 = 0.100000
llama_model_loader: - kv 38: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 39: tokenizer.ggml.pre str = deepseek-v3
llama_model_loader: - kv 40: tokenizer.ggml.tokens arr[str,129280] = ["<|begin▁of▁sentence|>", "<<3C>...
llama_model_loader: - kv 41: tokenizer.ggml.token_type arr[i32,129280] = [3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 42: tokenizer.ggml.merges arr[str,127741] = ["Ġ t", "Ġ a", "i n", "Ġ Ġ", "h e...
llama_model_loader: - kv 43: tokenizer.ggml.bos_token_id u32 = 0
llama_model_loader: - kv 44: tokenizer.ggml.eos_token_id u32 = 1
llama_model_loader: - kv 45: tokenizer.ggml.padding_token_id u32 = 128815
llama_model_loader: - kv 46: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 47: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 48: tokenizer.chat_template str = {% if not add_generation_prompt is de...
llama_model_loader: - kv 49: general.quantization_version u32 = 2
llama_model_loader: - kv 50: quantize.imatrix.file str = /models/deepseek-config/imatrix.dat
llama_model_loader: - kv 51: quantize.imatrix.dataset str = imatrix-training-full-3
llama_model_loader: - kv 52: quantize.imatrix.entries_count i32 = 720
llama_model_loader: - kv 53: quantize.imatrix.chunks_count i32 = 315
llama_model_loader: - type f32: 361 tensors
llama_model_loader: - type q8_0: 62 tensors
llama_model_loader: - type q5_K: 6 tensors
llama_model_loader: - type q6_K: 550 tensors
llama_model_loader: - type iq3_k: 104 tensors
llama_model_loader: - type iq4_k: 64 tensors
llm_load_vocab: special tokens cache size = 819
llm_load_vocab: token to piece cache size = 0.8223 MB
llm_load_print_meta: format = GGUF V3 (latest)
llm_load_print_meta: arch = deepseek2
llm_load_print_meta: vocab type = BPE
llm_load_print_meta: n_vocab = 129280
llm_load_print_meta: n_merges = 127741
llm_load_print_meta: vocab_only = 0
llm_load_print_meta: n_ctx_train = 163840
llm_load_print_meta: n_embd = 7168
llm_load_print_meta: n_layer = 61
llm_load_print_meta: n_head = 128
llm_load_print_meta: n_head_kv = 128
llm_load_print_meta: n_rot = 64
llm_load_print_meta: n_swa = 0
llm_load_print_meta: n_embd_head_k = 192
llm_load_print_meta: n_embd_head_v = 128
llm_load_print_meta: n_gqa = 1
llm_load_print_meta: n_embd_k_gqa = 24576
llm_load_print_meta: n_embd_v_gqa = 16384
llm_load_print_meta: f_norm_eps = 0.0e+00
llm_load_print_meta: f_norm_rms_eps = 1.0e-06
llm_load_print_meta: f_clamp_kqv = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: f_logit_scale = 0.0e+00
llm_load_print_meta: n_ff = 18432
llm_load_print_meta: n_expert = 256
llm_load_print_meta: n_expert_used = 8
llm_load_print_meta: causal attn = 1
llm_load_print_meta: pooling type = 0
llm_load_print_meta: rope type = 0
llm_load_print_meta: rope scaling = yarn
llm_load_print_meta: freq_base_train = 10000.0
llm_load_print_meta: freq_scale_train = 0.025
llm_load_print_meta: n_ctx_orig_yarn = 4096
llm_load_print_meta: rope_finetuned = unknown
llm_load_print_meta: ssm_d_conv = 0
llm_load_print_meta: ssm_d_inner = 0
llm_load_print_meta: ssm_d_state = 0
llm_load_print_meta: ssm_dt_rank = 0
llm_load_print_meta: model type = 671B
llm_load_print_meta: model ftype = Q6_K
llm_load_print_meta: model params = 672.050 B
llm_load_print_meta: model size = 311.346 GiB (3.980 BPW)
llm_load_print_meta: repeating layers = 309.721 GiB (3.970 BPW, 670.196 B parameters)
llm_load_print_meta: general.name = unsloth_DeepSeek R1 BF16
llm_load_print_meta: BOS token = 0 '<|begin▁of▁sentence|>'
llm_load_print_meta: EOS token = 1 '<|end▁of▁sentence|>'
llm_load_print_meta: PAD token = 128815 '<|PAD▁TOKEN|>'
llm_load_print_meta: LF token = 131 'Ä'
llm_load_print_meta: max token length = 256
llm_load_print_meta: n_layer_dense_lead = 3
llm_load_print_meta: n_lora_q = 1536
llm_load_print_meta: n_lora_kv = 512
llm_load_print_meta: n_ff_exp = 2048
llm_load_print_meta: n_expert_shared = 1
llm_load_print_meta: expert_weights_scale = 2.5
llm_load_print_meta: expert_weights_norm = 1
llm_load_print_meta: expert_gating_func = sigmoid
llm_load_print_meta: rope_yarn_log_mul = 0.1000
llm_load_tensors: ggml ctx size = 7.94 MiB
Tensor blk.3.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.3.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.3.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.6.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.6.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.6.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.9.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.9.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.9.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.13.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.13.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.13.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.17.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.17.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.17.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.21.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.21.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.21.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.25.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.25.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.25.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.29.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.29.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.29.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.33.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.33.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.33.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.37.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.37.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.37.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.41.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.41.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.41.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.45.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.45.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.45.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.49.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.49.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.49.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.53.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.53.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.53.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.57.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.57.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.57.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_up_exps.weight buffer type overriden to CUDA14
llm_load_tensors: offloading 61 repeating layers to GPU
llm_load_tensors: offloading non-repeating layers to GPU
llm_load_tensors: offloaded 62/62 layers to GPU
llm_load_tensors: CPU buffer size = 938.98 MiB
llm_load_tensors: CUDA0 buffer size = 21105.69 MiB
llm_load_tensors: CUDA1 buffer size = 20299.82 MiB
llm_load_tensors: CUDA2 buffer size = 21195.82 MiB
llm_load_tensors: CUDA3 buffer size = 21195.82 MiB
llm_load_tensors: CUDA4 buffer size = 21195.82 MiB
llm_load_tensors: CUDA5 buffer size = 21195.82 MiB
llm_load_tensors: CUDA6 buffer size = 21195.82 MiB
llm_load_tensors: CUDA7 buffer size = 20992.86 MiB
llm_load_tensors: CUDA8 buffer size = 21195.82 MiB
llm_load_tensors: CUDA9 buffer size = 21195.82 MiB
llm_load_tensors: CUDA10 buffer size = 21195.82 MiB
llm_load_tensors: CUDA11 buffer size = 21195.82 MiB
llm_load_tensors: CUDA12 buffer size = 21195.82 MiB
llm_load_tensors: CUDA13 buffer size = 21195.82 MiB
llm_load_tensors: CUDA14 buffer size = 21195.82 MiB
llm_load_tensors: CUDA15 buffer size = 1130.89 MiB
....................................................................................................
llama_new_context_with_model: n_ctx = 2048
llama_new_context_with_model: n_batch = 2048
llama_new_context_with_model: n_ubatch = 512
llama_new_context_with_model: flash_attn = 1
llama_new_context_with_model: mla_attn = 2
llama_new_context_with_model: attn_max_b = 0
llama_new_context_with_model: fused_moe = 1
llama_new_context_with_model: ser = -1, 0
llama_new_context_with_model: freq_base = 10000.0
llama_new_context_with_model: freq_scale = 0.025
llama_kv_cache_init: layer 0: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 1: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 2: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 3: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 4: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 5: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 6: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 7: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 8: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 9: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 10: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 11: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 12: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 13: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 14: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 15: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 16: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 17: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 18: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 19: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 20: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 21: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 22: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 23: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 24: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 25: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 26: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 27: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 28: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 29: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 30: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 31: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 32: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 33: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 34: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 35: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 36: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 37: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 38: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 39: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 40: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 41: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 42: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 43: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 44: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 45: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 46: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 47: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 48: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 49: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 50: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 51: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 52: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 53: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 54: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 55: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 56: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 57: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 58: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 59: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 60: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: CUDA0 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA1 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA2 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA3 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA4 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA5 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA6 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA7 KV buffer size = 6.75 MiB
llama_kv_cache_init: CUDA8 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA9 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA10 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA11 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA12 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA13 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA14 KV buffer size = 9.00 MiB
llama_kv_cache_init: CUDA15 KV buffer size = 4.50 MiB
llama_new_context_with_model: KV self size = 137.25 MiB, c^KV (f16): 137.25 MiB, kv^T: not used
llama_new_context_with_model: CUDA_Host output buffer size = 1.97 MiB
llama_new_context_with_model: CUDA0 compute buffer size = 455.00 MiB
llama_new_context_with_model: CUDA1 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA2 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA3 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA4 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA5 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA6 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA7 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA8 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA9 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA10 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA11 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA12 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA13 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA14 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA15 compute buffer size = 476.00 MiB
llama_new_context_with_model: CUDA_Host compute buffer size = 18.01 MiB
llama_new_context_with_model: graph nodes = 3487
llama_new_context_with_model: graph splits = 65
system_info: n_threads = 64 / 128 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 |
perplexity: tokenizing the input ..
perplexity: tokenization took 1206.19 ms
perplexity: calculating perplexity over 561 chunks, n_ctx=512, batch_size=2048, n_seq=4
perplexity: 18.19 seconds per pass - ETA 42.52 minutes
[1]2.5586,[2]3.3168,[3]2.3936,[4]2.0039,[5]1.8142,[6]1.6702,[7]1.5775,[8]1.5075,[9]1.4595,[10]1.4201,[11]1.4070,[12]1.4370,[13]1.4496,[14]1.5781,[15]1.7114,[16]1.7697,[17]1.9326,[18]2.0621,[19]2.0268,[20]2.0155,[21]2.1268,[22]2.1007,[23]2.0732,[24]2.0870,[25]2.0591,[26]2.0345,[27]2.0835,[28]2.0909,[29]2.1415,[30]2.1734,[31]2.2082,[32]2.2263,[33]2.2655,[34]2.3099,[35]2.3604,[36]2.4126,[37]2.4474,[38]2.4946,[39]2.5347,[40]2.5951,[41]2.6395,[42]2.6509,[43]2.7003,[44]2.7157,[45]2.7953,[46]2.8467,[47]2.8028,[48]2.7555,[49]2.7335,[50]2.7524,[51]2.7983,[52]2.8127,[53]2.8648,[54]2.8779,[55]2.9093,[56]2.9418,[57]2.9568,[58]2.9947,[59]3.0068,[60]3.0541,[61]3.0963,[62]3.1488,[63]3.1812,[64]3.2268,[65]3.2363,[66]3.2208,[67]3.1983,[68]3.2313,[69]3.2274,[70]3.2453,[71]3.2632,[72]3.2785,[73]3.2926,[74]3.3158,[75]3.2951,[76]3.2468,[77]3.2033,[78]3.1978,[79]3.1752,[80]3.1585,[81]3.1220,[82]3.1263,[83]3.0939,[84]3.0572,[85]3.0226,[86]2.9979,[87]2.9913,[88]2.9620,[89]2.9446,[90]2.9189,[91]2.8900,[92]2.8640,[93]2.8371,[94]2.8118,[95]2.7877,[96]2.7867,[97]2.7933,[98]2.7781,[99]2.7607,[100]2.7637,[101]2.7557,[102]2.7735,[103]2.8013,[104]2.8209,[105]2.8182,[106]2.8415,[107]2.8660,[108]2.8867,[109]2.9204,[110]2.9543,[111]2.9739,[112]2.9475,[113]2.9354,[114]2.9137,[115]2.8968,[116]2.8835,[117]2.8601,[118]2.8389,[119]2.8175,[120]2.7984,[121]2.7821,[122]2.7637,[123]2.7472,[124]2.7279,[125]2.7105,[126]2.6936,[127]2.6807,[128]2.6722,[129]2.6625,[130]2.6503,[131]2.6437,[132]2.6510,[133]2.6603,[134]2.6677,[135]2.6788,[136]2.6953,[137]2.7118,[138]2.7200,[139]2.7321,[140]2.7326,[141]2.7339,[142]2.7328,[143]2.7330,[144]2.7294,[145]2.7202,[146]2.7186,[147]2.7227,[148]2.7223,[149]2.7235,[150]2.7177,[151]2.7155,[152]2.7123,[153]2.7083,[154]2.7085,[155]2.7128,[156]2.7143,[157]2.7202,[158]2.7292,[159]2.7310,[160]2.7400,[161]2.7484,[162]2.7577,[163]2.7627,[164]2.7832,[165]2.8070,[166]2.8242,[167]2.8363,[168]2.8608,[169]2.8836,[170]2.9055,[171]2.9290,[172]2.9125,[173]2.8952,[174]2.8821,[175]2.8691,[176]2.8562,[177]2.8448,[178]2.8317,[179]2.8177,[180]2.8216,[181]2.8357,[182]2.8509,[183]2.8654,[184]2.8795,[185]2.8898,[186]2.9065,[187]2.9222,[188]2.9363,[189]2.9468,[190]2.9471,[191]2.9542,[192]2.9580,[193]2.9635,[194]2.9830,[195]2.9918,[196]3.0051,[197]3.0148,[198]3.0188,[199]3.0241,[200]3.0233,[201]3.0385,[202]3.0337,[203]3.0388,[204]3.0422,[205]3.0420,[206]3.0443,[207]3.0531,[208]3.0634,[209]3.0728,[210]3.0731,[211]3.0685,[212]3.0687,[213]3.0762,[214]3.0784,[215]3.0843,[216]3.0848,[217]3.0807,[218]3.0809,[219]3.0820,[220]3.0810,[221]3.0813,[222]3.0813,[223]3.0817,[224]3.0867,[225]3.0886,[226]3.0803,[227]3.0781,[228]3.0803,[229]3.0848,[230]3.0911,[231]3.0974,[232]3.0887,[233]3.0807,[234]3.0811,[235]3.0794,[236]3.0880,[237]3.0963,[238]3.1062,[239]3.1160,[240]3.1252,[241]3.1364,[242]3.1510,[243]3.1645,[244]3.1728,[245]3.1840,[246]3.1943,[247]3.1934,[248]3.1892,[249]3.1868,[250]3.1800,[251]3.1777,[252]3.1798,[253]3.1834,[254]3.1905,[255]3.1969,[256]3.2006,[257]3.2030,[258]3.2042,[259]3.2075,[260]3.2096,[261]3.2107,[262]3.2097,[263]3.2156,[264]3.2179,[265]3.2182,[266]3.2202,[267]3.2229,[268]3.2268,[269]3.2301,[270]3.2294,[271]3.2280,[272]3.2208,[273]3.2207,[274]3.2138,[275]3.2031,[276]3.1926,[277]3.1942,[278]3.2046,[279]3.2111,[280]3.2190,[281]3.2263,[282]3.2325,[283]3.2389,[284]3.2453,[285]3.2590,[286]3.2612,[287]3.2646,[288]3.2693,[289]3.2719,[290]3.2635,[291]3.2541,[292]3.2530,[293]3.2522,[294]3.2497,[295]3.2474,[296]3.2496,[297]3.2501,[298]3.2552,[299]3.2615,[300]3.2648,[301]3.2686,[302]3.2711,[303]3.2731,[304]3.2725,[305]3.2842,[30
6]3.2918,[307]3.3028,[308]3.2915,[309]3.2864,[310]3.2767,[311]3.2808,[312]3.2834,[313]3.2905,[314]3.2929,[315]3.2962,[316]3.2976,[317]3.2993,[318]3.2998,[319]3.3001,[320]3.3043,[321]3.3045,[322]3.3063,[323]3.3130,[324]3.3136,[325]3.3189,[326]3.3235,[327]3.3277,[328]3.3307,[329]3.3323,[330]3.3385,[331]3.3425,[332]3.3470,[333]3.3456,[334]3.3455,[335]3.3460,[336]3.3460,[337]3.3469,[338]3.3473,[339]3.3498,[340]3.3535,[341]3.3589,[342]3.3677,[343]3.3770,[344]3.3823,[345]3.3740,[346]3.3661,[347]3.3608,[348]3.3532,[349]3.3494,[350]3.3475,[351]3.3521,[352]3.3673,[353]3.3764,[354]3.3896,[355]3.3983,[356]3.4036,[357]3.4156,[358]3.4254,[359]3.4285,[360]3.4348,[361]3.4443,[362]3.4531,[363]3.4589,[364]3.4653,[365]3.4718,[366]3.4826,[367]3.4915,[368]3.4983,[369]3.5063,[370]3.5149,[371]3.5286,[372]3.5377,[373]3.5409,[374]3.5444,[375]3.5493,[376]3.5624,[377]3.5738,[378]3.5767,[379]3.5761,[380]3.5727,
[381]3.5775,[382]3.5833,[383]3.5868,[384]3.5911,[385]3.5948,[386]3.6008,[387]3.6066,[388]3.6099,[389]3.5991,[390]3.5896,[391]3.5787,[392]3.5730,[393]3.5635,[394]3.5542,[395]3.5450,[396]3.5347,[397]3.5257,[398]3.5160,[399]3.5056,[400]3.4975,[401]3.4874,[402]3.4768,[403]3.4678,[404]3.4573,[405]3.4476,[406]3.4375,[407]3.4281,[408]3.4192,[409]3.4103,[410]3.4041,[411]3.4049,[412]3.4004,[413]3.4022,[414]3.4041,[415]3.4011,[416]3.4013,[417]3.4034,[418]3.3976,[419]3.3989,[420]3.3964,[421]3.3953,[422]3.3969,[423]3.3962,[424]3.4001,[425]3.3995,[426]3.4001,[427]3.3991,[428]3.4014,[429]3.4032,[430]3.4061,[431]3.4070,[432]3.4063,[433]3.4025,[434]3.4028,[435]3.3954,[436]3.3891,[437]3.3850,[438]3.3831,[439]3.3805,[440]3.3854,[441]3.3908,[442]3.3981,[443]3.3963,[444]3.3969,[445]3.3980,[446]3.4029,[447]3.4060,[448]3.4086,[449]3.4116,[450]3.4154,[451]3.4183,[452]3.4205,[453]3.4224,[454]3.4208,[455]3.4229,[456]3.4232,[457]3.4260,[458]3.4311,[459]3.4317,[460]3.4318,[461]3.4283,[462]3.4320,[463]3.4395,[464]3.4450,[465]3.4379,[466]3.4361,[467]3.4345,[468]3.4359,[469]3.4330,[470]3.4301,[471]3.4306,[472]3.4314,[473]3.4306,[474]3.4296,[475]3.4307,[476]3.4293,[477]3.4283,[478]3.4291,[479]3.4308,[480]3.4335,[481]3.4294,[482]3.4328,[483]3.4319,[484]3.4355,[485]3.4418,[486]3.4448,[487]3.4487,[488]3.4541,[489]3.4566,[490]3.4610,[491]3.4673,[492]3.4718,[493]3.4714,[494]3.4726,[495]3.4751,[496]3.4770,[497]3.4799,[498]3.4802,[499]3.4795,[500]3.4835,[501]3.4880,[502]3.4873,[503]3.4858,[504]3.4878,[505]3.4910,[506]3.4996,[507]3.5024,[508]3.5058,[509]3.4982,[510]3.4929,[511]3.4865,[512]3.4821,[513]3.4759,[514]3.4746,[515]3.4771,[516]3.4721,[517]3.4719,[518]3.4709,[519]3.4716,[520]3.4765,[521]3.4751,[522]3.4736,[523]3.4794,[524]3.4783,[525]3.4766,[526]3.4719,[527]3.4668,[528]3.4634,[529]3.4601,[530]3.4570,[531]3.4539,[532]3.4481,[533]3.4416,[534]3.4376,[535]3.4384,[536]3.4414,[537]3.4444,[538]3.4472,[539]3.4499,[540]3.4553,[541]3.4589,[542]3.4613,[543]3.4555,[544]3.4514,[545]3.4510,[546]3.4443,[547]3.4380,[548]3.4314,[549]3.4247,[550]3.4187,[551]3.4125,[552]3.4067,[553]3.4010,[554]3.3990,[555]3.3976,[556]3.4004,[557]3.4045,[558]3.4104,[559]3.4150,[560]3.4202,[561]3.4184,
Final estimate: PPL = 3.4184 +/- 0.01902