
🐛 #262 - Fix #261

Author ikawrakow
State Closed
Created 2025-03-18
Updated 2025-03-18

💬 Conversation

👤 davidsyoung commented on 2025-03-18 at 10:41:29:

Unfortunately, I'm still getting NaNs under perplexity. I built the latest PR regarding the q8_0 KV cache.

Quant command
./build/bin/llama-quantize --imatrix /models/deepseek-config/imatrix.dat \
  --token-embedding-type q8_0 \
  --attn-q-type q6_K \
  --attn-k-type q6_K \
  --attn-v-type q6_K \
  --attn-qkv-type q6_K \
  --attn-output-type q6_K \
  --ffn-gate-type q6_K \
  --ffn-down-type q6_K \
  --ffn-up-type q6_K \
  --custom-q "\.ffn_.*_shexp\.weight=q6_K,output\.weight=q6_K" \
  --custom-q "blk\.3\.ffn_down_exps\.weight=q5_K,blk\.4\.ffn_down_exps\.weight=q5_K,blk\.5\.ffn_down_exps\.weight=q5_K,blk\.3\.ffn_up_exps\.weight=iq4_k,blk\.3\.ffn_gate_exps\.weight=iq4_k,blk\.4\.ffn_up_exps\.weight=iq4_k,blk\.4\.ffn_gate_exps\.weight=iq4_k,blk\.5\.ffn_up_exps\.weight=iq4_k,blk\.5\.ffn_gate_exps\.weight=iq4_k" \
  --custom-q "blk\.6\.ffn_down_exps\.weight=q5_K,blk\.7\.ffn_down_exps\.weight=q5_K,blk\.8\.ffn_down_exps\.weight=q5_K,blk\.6\.ffn_up_exps\.weight=iq4_k,blk\.6\.ffn_gate_exps\.weight=iq4_k,blk\.7\.ffn_up_exps\.weight=iq4_k,blk\.7\.ffn_gate_exps\.weight=iq4_k,blk\.8\.ffn_up_exps\.weight=iq4_k,blk\.8\.ffn_gate_exps\.weight=iq4_k" \
  --custom-q "blk\.9\.ffn_down_exps\.weight=iq4_k,blk\.10\.ffn_down_exps\.weight=iq4_k,blk\.11\.ffn_down_exps\.weight=iq4_k,blk\.12\.ffn_down_exps\.weight=iq4_k,blk\.9\.ffn_up_exps\.weight=iq3_k,blk\.9\.ffn_gate_exps\.weight=iq3_k,blk\.10\.ffn_up_exps\.weight=iq3_k,blk\.10\.ffn_gate_exps\.weight=iq3_k,blk\.11\.ffn_up_exps\.weight=iq3_k,blk\.11\.ffn_gate_exps\.weight=iq3_k,blk\.12\.ffn_up_exps\.weight=iq3_k,blk\.12\.ffn_gate_exps\.weight=iq3_k" \
  --custom-q "blk\.13\.ffn_down_exps\.weight=iq4_k,blk\.14\.ffn_down_exps\.weight=iq4_k,blk\.15\.ffn_down_exps\.weight=iq4_k,blk\.16\.ffn_down_exps\.weight=iq4_k,blk\.13\.ffn_up_exps\.weight=iq3_k,blk\.13\.ffn_gate_exps\.weight=iq3_k,blk\.14\.ffn_up_exps\.weight=iq3_k,blk\.14\.ffn_gate_exps\.weight=iq3_k,blk\.15\.ffn_up_exps\.weight=iq3_k,blk\.15\.ffn_gate_exps\.weight=iq3_k,blk\.16\.ffn_up_exps\.weight=iq3_k,blk\.16\.ffn_gate_exps\.weight=iq3_k" \
  --custom-q "blk\.17\.ffn_down_exps\.weight=iq4_k,blk\.18\.ffn_down_exps\.weight=iq4_k,blk\.19\.ffn_down_exps\.weight=iq4_k,blk\.20\.ffn_down_exps\.weight=iq4_k,blk\.17\.ffn_up_exps\.weight=iq3_k,blk\.17\.ffn_gate_exps\.weight=iq3_k,blk\.18\.ffn_up_exps\.weight=iq3_k,blk\.18\.ffn_gate_exps\.weight=iq3_k,blk\.19\.ffn_up_exps\.weight=iq3_k,blk\.19\.ffn_gate_exps\.weight=iq3_k,blk\.20\.ffn_up_exps\.weight=iq3_k,blk\.20\.ffn_gate_exps\.weight=iq3_k" \
  --custom-q "blk\.21\.ffn_down_exps\.weight=iq4_k,blk\.22\.ffn_down_exps\.weight=iq4_k,blk\.23\.ffn_down_exps\.weight=iq4_k,blk\.24\.ffn_down_exps\.weight=iq4_k,blk\.21\.ffn_up_exps\.weight=iq3_k,blk\.21\.ffn_gate_exps\.weight=iq3_k,blk\.22\.ffn_up_exps\.weight=iq3_k,blk\.22\.ffn_gate_exps\.weight=iq3_k,blk\.23\.ffn_up_exps\.weight=iq3_k,blk\.23\.ffn_gate_exps\.weight=iq3_k,blk\.24\.ffn_up_exps\.weight=iq3_k,blk\.24\.ffn_gate_exps\.weight=iq3_k" \
  --custom-q "blk\.25\.ffn_down_exps\.weight=iq4_k,blk\.26\.ffn_down_exps\.weight=iq4_k,blk\.27\.ffn_down_exps\.weight=iq4_k,blk\.28\.ffn_down_exps\.weight=iq4_k,blk\.25\.ffn_up_exps\.weight=iq3_k,blk\.25\.ffn_gate_exps\.weight=iq3_k,blk\.26\.ffn_up_exps\.weight=iq3_k,blk\.26\.ffn_gate_exps\.weight=iq3_k,blk\.27\.ffn_up_exps\.weight=iq3_k,blk\.27\.ffn_gate_exps\.weight=iq3_k,blk\.28\.ffn_up_exps\.weight=iq3_k,blk\.28\.ffn_gate_exps\.weight=iq3_k" \
  --custom-q "blk\.29\.ffn_down_exps\.weight=iq4_k,blk\.30\.ffn_down_exps\.weight=iq4_k,blk\.31\.ffn_down_exps\.weight=iq4_k,blk\.32\.ffn_down_exps\.weight=iq4_k,blk\.29\.ffn_up_exps\.weight=iq3_k,blk\.29\.ffn_gate_exps\.weight=iq3_k,blk\.30\.ffn_up_exps\.weight=iq3_k,blk\.30\.ffn_gate_exps\.weight=iq3_k,blk\.31\.ffn_up_exps\.weight=iq3_k,blk\.31\.ffn_gate_exps\.weight=iq3_k,blk\.32\.ffn_up_exps\.weight=iq3_k,blk\.32\.ffn_gate_exps\.weight=iq3_k" \
  --custom-q "blk\.33\.ffn_down_exps\.weight=iq4_k,blk\.34\.ffn_down_exps\.weight=iq4_k,blk\.35\.ffn_down_exps\.weight=iq4_k,blk\.36\.ffn_down_exps\.weight=iq4_k,blk\.33\.ffn_up_exps\.weight=iq3_k,blk\.33\.ffn_gate_exps\.weight=iq3_k,blk\.34\.ffn_up_exps\.weight=iq3_k,blk\.34\.ffn_gate_exps\.weight=iq3_k,blk\.35\.ffn_up_exps\.weight=iq3_k,blk\.35\.ffn_gate_exps\.weight=iq3_k,blk\.36\.ffn_up_exps\.weight=iq3_k,blk\.36\.ffn_gate_exps\.weight=iq3_k" \
  --custom-q "blk\.37\.ffn_down_exps\.weight=iq4_k,blk\.38\.ffn_down_exps\.weight=iq4_k,blk\.39\.ffn_down_exps\.weight=iq4_k,blk\.40\.ffn_down_exps\.weight=iq4_k,blk\.37\.ffn_up_exps\.weight=iq3_k,blk\.37\.ffn_gate_exps\.weight=iq3_k,blk\.38\.ffn_up_exps\.weight=iq3_k,blk\.38\.ffn_gate_exps\.weight=iq3_k,blk\.39\.ffn_up_exps\.weight=iq3_k,blk\.39\.ffn_gate_exps\.weight=iq3_k,blk\.40\.ffn_up_exps\.weight=iq3_k,blk\.40\.ffn_gate_exps\.weight=iq3_k" \
  --custom-q "blk\.41\.ffn_down_exps\.weight=iq4_k,blk\.42\.ffn_down_exps\.weight=iq4_k,blk\.43\.ffn_down_exps\.weight=iq4_k,blk\.44\.ffn_down_exps\.weight=iq4_k,blk\.41\.ffn_up_exps\.weight=iq3_k,blk\.41\.ffn_gate_exps\.weight=iq3_k,blk\.42\.ffn_up_exps\.weight=iq3_k,blk\.42\.ffn_gate_exps\.weight=iq3_k,blk\.43\.ffn_up_exps\.weight=iq3_k,blk\.43\.ffn_gate_exps\.weight=iq3_k,blk\.44\.ffn_up_exps\.weight=iq3_k,blk\.44\.ffn_gate_exps\.weight=iq3_k" \
  --custom-q "blk\.45\.ffn_down_exps\.weight=iq4_k,blk\.46\.ffn_down_exps\.weight=iq4_k,blk\.47\.ffn_down_exps\.weight=iq4_k,blk\.48\.ffn_down_exps\.weight=iq4_k,blk\.45\.ffn_up_exps\.weight=iq3_k,blk\.45\.ffn_gate_exps\.weight=iq3_k,blk\.46\.ffn_up_exps\.weight=iq3_k,blk\.46\.ffn_gate_exps\.weight=iq3_k,blk\.47\.ffn_up_exps\.weight=iq3_k,blk\.47\.ffn_gate_exps\.weight=iq3_k,blk\.48\.ffn_up_exps\.weight=iq3_k,blk\.48\.ffn_gate_exps\.weight=iq3_k" \
  --custom-q "blk\.49\.ffn_down_exps\.weight=iq4_k,blk\.50\.ffn_down_exps\.weight=iq4_k,blk\.51\.ffn_down_exps\.weight=iq4_k,blk\.52\.ffn_down_exps\.weight=iq4_k,blk\.49\.ffn_up_exps\.weight=iq3_k,blk\.49\.ffn_gate_exps\.weight=iq3_k,blk\.50\.ffn_up_exps\.weight=iq3_k,blk\.50\.ffn_gate_exps\.weight=iq3_k,blk\.51\.ffn_up_exps\.weight=iq3_k,blk\.51\.ffn_gate_exps\.weight=iq3_k,blk\.52\.ffn_up_exps\.weight=iq3_k,blk\.52\.ffn_gate_exps\.weight=iq3_k" \
  --custom-q "blk\.53\.ffn_down_exps\.weight=iq4_k,blk\.54\.ffn_down_exps\.weight=iq4_k,blk\.55\.ffn_down_exps\.weight=iq4_k,blk\.56\.ffn_down_exps\.weight=iq4_k,blk\.53\.ffn_up_exps\.weight=iq3_k,blk\.53\.ffn_gate_exps\.weight=iq3_k,blk\.54\.ffn_up_exps\.weight=iq3_k,blk\.54\.ffn_gate_exps\.weight=iq3_k,blk\.55\.ffn_up_exps\.weight=iq3_k,blk\.55\.ffn_gate_exps\.weight=iq3_k,blk\.56\.ffn_up_exps\.weight=iq3_k,blk\.56\.ffn_gate_exps\.weight=iq3_k" \
  --custom-q "blk\.57\.ffn_down_exps\.weight=iq4_k,blk\.58\.ffn_down_exps\.weight=iq4_k,blk\.59\.ffn_down_exps\.weight=iq4_k,blk\.60\.ffn_down_exps\.weight=iq4_k,blk\.57\.ffn_up_exps\.weight=iq3_k,blk\.57\.ffn_gate_exps\.weight=iq3_k,blk\.58\.ffn_up_exps\.weight=iq3_k,blk\.58\.ffn_gate_exps\.weight=iq3_k,blk\.59\.ffn_up_exps\.weight=iq3_k,blk\.59\.ffn_gate_exps\.weight=iq3_k,blk\.60\.ffn_up_exps\.weight=iq3_k,blk\.60\.ffn_gate_exps\.weight=iq3_k" \
  /storage/DeepSeek-R1-GGUF/unsloth_DeepSeek-R1-BF16-256x21B-F16-00001-of-00059.gguf \
  /models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf \
  q6_K 6
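
For readers unfamiliar with `--custom-q`, here is a minimal sketch (not the actual ik_llama.cpp implementation) of how the arguments above can be read: each `--custom-q` value is a comma-separated list of `regex=type` pairs, and a tensor name is assigned the type of the first rule whose regex matches it (first-match-wins is an assumption here), falling back to the base type given on the command line. The "Adding custom rule" lines in the output below correspond to these parsed pairs.

```python
import re

# Hypothetical illustration of --custom-q rule handling; tensor names and
# rule strings are taken from the command above.
custom_q_args = [
    r"\.ffn_.*_shexp\.weight=q6_K,output\.weight=q6_K",
    r"blk\.3\.ffn_down_exps\.weight=q5_K,blk\.3\.ffn_up_exps\.weight=iq4_k",
]

rules = []
for arg in custom_q_args:
    for pair in arg.split(","):
        pattern, qtype = pair.rsplit("=", 1)
        print(f"Adding custom rule {pattern} -> {qtype}")
        rules.append((re.compile(pattern), qtype))

def quant_type_for(tensor_name: str, default: str = "q6_K") -> str:
    # Assumed behaviour: rules are checked in order, first match wins.
    for pattern, qtype in rules:
        if pattern.search(tensor_name):
            return qtype
    return default  # otherwise use the base type from the command line

print(quant_type_for("blk.3.ffn_down_exps.weight"))  # -> q5_K
print(quant_type_for("blk.0.ffn_down.weight"))       # -> q6_K (default)
```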
Quant command output
Adding custom rule \.ffn_.*_shexp\.weight -> q6_K
Adding custom rule output\.weight -> q6_K
Adding custom rule blk\.3\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.4\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.5\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.3\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.3\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.4\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.4\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.5\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.5\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.6\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.7\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.8\.ffn_down_exps\.weight -> q5_K
Adding custom rule blk\.6\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.6\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.7\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.7\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.8\.ffn_up_exps\.weight -> iq4_k
Adding custom rule blk\.8\.ffn_gate_exps\.weight -> iq4_k
Adding custom rule blk\.9\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.10\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.11\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.12\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.9\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.9\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.10\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.10\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.11\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.11\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.12\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.12\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.13\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.14\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.15\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.16\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.13\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.13\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.14\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.14\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.15\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.15\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.16\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.16\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.17\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.18\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.19\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.20\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.17\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.17\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.18\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.18\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.19\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.19\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.20\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.20\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.21\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.22\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.23\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.24\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.21\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.21\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.22\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.22\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.23\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.23\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.24\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.24\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.25\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.26\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.27\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.28\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.25\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.25\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.26\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.26\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.27\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.27\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.28\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.28\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.29\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.30\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.31\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.32\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.29\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.29\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.30\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.30\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.31\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.31\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.32\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.32\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.33\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.34\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.35\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.36\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.33\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.33\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.34\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.34\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.35\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.35\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.36\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.36\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.37\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.38\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.39\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.40\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.37\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.37\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.38\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.38\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.39\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.39\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.40\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.40\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.41\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.42\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.43\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.44\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.41\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.41\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.42\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.42\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.43\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.43\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.44\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.44\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.45\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.46\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.47\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.48\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.45\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.45\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.46\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.46\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.47\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.47\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.48\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.48\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.49\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.50\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.51\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.52\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.49\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.49\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.50\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.50\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.51\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.51\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.52\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.52\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.53\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.54\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.55\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.56\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.53\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.53\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.54\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.54\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.55\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.55\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.56\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.56\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.57\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.58\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.59\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.60\.ffn_down_exps\.weight -> iq4_k
Adding custom rule blk\.57\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.57\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.58\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.58\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.59\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.59\.ffn_gate_exps\.weight -> iq3_k
Adding custom rule blk\.60\.ffn_up_exps\.weight -> iq3_k
Adding custom rule blk\.60\.ffn_gate_exps\.weight -> iq3_k
load_imatrix: imatrix dataset='imatrix-training-full-3'
load_imatrix: loaded 720 importance matrix entries from /models/deepseek-config/imatrix.dat computed on 315 chunks
prepare_imatrix: have 720 importance matrix entries
main: build = 0 (unknown)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: quantizing '/storage/DeepSeek-R1-GGUF/unsloth_DeepSeek-R1-BF16-256x21B-F16-00001-of-00059.gguf' to '/models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf' as Q6_K using 64 threads
llama_model_loader: additional 58 GGUFs metadata loaded.
llama_model_loader: loaded meta data with 53 key-value pairs and 1147 tensors from /storage/DeepSeek-R1-GGUF/unsloth_DeepSeek-R1-BF16-256x21B-F16-00001-of-00059.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = deepseek2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = unsloth_DeepSeek R1 BF16
llama_model_loader: - kv   3:                         general.size_label str              = 256x21B
llama_model_loader: - kv   4:                            general.license str              = mit
llama_model_loader: - kv   5:                   general.base_model.count u32              = 1
llama_model_loader: - kv   6:                  general.base_model.0.name str              = DeepSeek R1
llama_model_loader: - kv   7:          general.base_model.0.organization str              = Deepseek Ai
llama_model_loader: - kv   8:              general.base_model.0.repo_url str              = https://huggingface.co/deepseek-ai/De...
llama_model_loader: - kv   9:                               general.tags arr[str,3]       = ["deepseek", "unsloth", "transformers"]
llama_model_loader: - kv  10:                          general.languages arr[str,1]       = ["en"]
llama_model_loader: - kv  11:                      deepseek2.block_count u32              = 61
llama_model_loader: - kv  12:                   deepseek2.context_length u32              = 163840
llama_model_loader: - kv  13:                 deepseek2.embedding_length u32              = 7168
llama_model_loader: - kv  14:              deepseek2.feed_forward_length u32              = 18432
llama_model_loader: - kv  15:             deepseek2.attention.head_count u32              = 128
llama_model_loader: - kv  16:          deepseek2.attention.head_count_kv u32              = 128
llama_model_loader: - kv  17:                   deepseek2.rope.freq_base f32              = 10000.000000
llama_model_loader: - kv  18: deepseek2.attention.layer_norm_rms_epsilon f32              = 0.000001
llama_model_loader: - kv  19:                deepseek2.expert_used_count u32              = 8
llama_model_loader: - kv  20:                          general.file_type u32              = 1
llama_model_loader: - kv  21:        deepseek2.leading_dense_block_count u32              = 3
llama_model_loader: - kv  22:                       deepseek2.vocab_size u32              = 129280
llama_model_loader: - kv  23:            deepseek2.attention.q_lora_rank u32              = 1536
llama_model_loader: - kv  24:           deepseek2.attention.kv_lora_rank u32              = 512
llama_model_loader: - kv  25:             deepseek2.attention.key_length u32              = 192
llama_model_loader: - kv  26:           deepseek2.attention.value_length u32              = 128
llama_model_loader: - kv  27:       deepseek2.expert_feed_forward_length u32              = 2048
llama_model_loader: - kv  28:                     deepseek2.expert_count u32              = 256
llama_model_loader: - kv  29:              deepseek2.expert_shared_count u32              = 1
llama_model_loader: - kv  30:             deepseek2.expert_weights_scale f32              = 2.500000
llama_model_loader: - kv  31:              deepseek2.expert_weights_norm bool             = true
llama_model_loader: - kv  32:               deepseek2.expert_gating_func u32              = 2
llama_model_loader: - kv  33:             deepseek2.rope.dimension_count u32              = 64
llama_model_loader: - kv  34:                deepseek2.rope.scaling.type str              = yarn
llama_model_loader: - kv  35:              deepseek2.rope.scaling.factor f32              = 40.000000
llama_model_loader: - kv  36: deepseek2.rope.scaling.original_context_length u32              = 4096
llama_model_loader: - kv  37: deepseek2.rope.scaling.yarn_log_multiplier f32              = 0.100000
llama_model_loader: - kv  38:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  39:                         tokenizer.ggml.pre str              = deepseek-v3
llama_model_loader: - kv  40:                      tokenizer.ggml.tokens arr[str,129280]  = ["<begin▁of▁sentence>", "<<3C>...
llama_model_loader: - kv  41:                  tokenizer.ggml.token_type arr[i32,129280]  = [3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  42:                      tokenizer.ggml.merges arr[str,127741]  = ["Ġ t", "Ġ a", "i n", "Ġ Ġ", "h e...
llama_model_loader: - kv  43:                tokenizer.ggml.bos_token_id u32              = 0
llama_model_loader: - kv  44:                tokenizer.ggml.eos_token_id u32              = 1
llama_model_loader: - kv  45:            tokenizer.ggml.padding_token_id u32              = 128815
llama_model_loader: - kv  46:               tokenizer.ggml.add_bos_token bool             = true
llama_model_loader: - kv  47:               tokenizer.ggml.add_eos_token bool             = false
llama_model_loader: - kv  48:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
llama_model_loader: - kv  49:               general.quantization_version u32              = 2
llama_model_loader: - kv  50:                                   split.no u16              = 0
llama_model_loader: - kv  51:                                split.count u16              = 59
llama_model_loader: - kv  52:                        split.tensors.count i32              = 1147
llama_model_loader: - type  f32:  361 tensors
llama_model_loader: - type  f16:  786 tensors
================================ Have weights data with 720 entries
[   1/1147]                    token_embd.weight - [ 7168, 129280,     1,     1], type =    f16, 
====== llama_model_quantize_internal: did not find weights for token_embd.weight
converting to q8_0 .. size =  1767.50 MiB ->   938.98 MiB
[   2/1147]               blk.0.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[   3/1147]                blk.0.ffn_down.weight - [18432,  7168,     1,     1], type =    f16, converting to q6_K .. size =   252.00 MiB ->   103.36 MiB
[   4/1147]                blk.0.ffn_gate.weight - [ 7168, 18432,     1,     1], type =    f16, converting to q6_K .. size =   252.00 MiB ->   103.36 MiB
[   5/1147]                  blk.0.ffn_up.weight - [ 7168, 18432,     1,     1], type =    f16, converting to q6_K .. size =   252.00 MiB ->   103.36 MiB
[   6/1147]                blk.0.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[   7/1147]          blk.0.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[   8/1147]           blk.0.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[   9/1147]               blk.0.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[  10/1147]                blk.0.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.0.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[  11/1147]                blk.0.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[  12/1147]             blk.0.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.0.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[  13/1147]           blk.0.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[  14/1147]                blk.0.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[  15/1147]                blk.0.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[  16/1147]               blk.1.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[  17/1147]                blk.1.ffn_down.weight - [18432,  7168,     1,     1], type =    f16, converting to q6_K .. size =   252.00 MiB ->   103.36 MiB
[  18/1147]                blk.1.ffn_gate.weight - [ 7168, 18432,     1,     1], type =    f16, converting to q6_K .. size =   252.00 MiB ->   103.36 MiB
[  19/1147]                  blk.1.ffn_up.weight - [ 7168, 18432,     1,     1], type =    f16, converting to q6_K .. size =   252.00 MiB ->   103.36 MiB
[  20/1147]                blk.1.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[  21/1147]          blk.1.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[  22/1147]           blk.1.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[  23/1147]               blk.1.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[  24/1147]                blk.1.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.1.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[  25/1147]                blk.1.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[  26/1147]             blk.1.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.1.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[  27/1147]           blk.1.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[  28/1147]                blk.1.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[  29/1147]                blk.1.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[  30/1147]               blk.2.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[  31/1147]                blk.2.ffn_down.weight - [18432,  7168,     1,     1], type =    f16, converting to q6_K .. size =   252.00 MiB ->   103.36 MiB
[  32/1147]                blk.2.ffn_gate.weight - [ 7168, 18432,     1,     1], type =    f16, converting to q6_K .. size =   252.00 MiB ->   103.36 MiB
[  33/1147]                  blk.2.ffn_up.weight - [ 7168, 18432,     1,     1], type =    f16, converting to q6_K .. size =   252.00 MiB ->   103.36 MiB
[  34/1147]                blk.2.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[  35/1147]          blk.2.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[  36/1147]           blk.2.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[  37/1147]               blk.2.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[  38/1147]                blk.2.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.2.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[  39/1147]                blk.2.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[  40/1147]             blk.2.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.2.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[  41/1147]           blk.2.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[  42/1147]                blk.2.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[  43/1147]                blk.2.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[  44/1147]               blk.3.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[  45/1147]            blk.3.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[  46/1147]          blk.3.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.3.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[  47/1147]          blk.3.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.3.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[  48/1147]            blk.3.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.3.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[  49/1147]          blk.3.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[  50/1147]           blk.3.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[  51/1147]               blk.3.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[  52/1147]                blk.3.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.3.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[  53/1147]                blk.3.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[  54/1147]             blk.3.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.3.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[  55/1147]           blk.3.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[  56/1147]                blk.3.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[  57/1147]                blk.3.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[  58/1147]               blk.3.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[  59/1147]           blk.3.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type q5_K for tensor blk.3.ffn_down_exps.weight
converting to q5_K .. size =  7168.00 MiB ->  2464.00 MiB
[  60/1147]           blk.3.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.3.ffn_gate_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[  61/1147]             blk.3.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.3.ffn_up_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[  62/1147]                blk.3.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[  63/1147]               blk.4.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[  64/1147]            blk.4.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[  65/1147]          blk.4.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.4.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[  66/1147]          blk.4.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.4.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[  67/1147]            blk.4.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.4.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[  68/1147]          blk.4.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[  69/1147]           blk.4.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[  70/1147]               blk.4.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[  71/1147]                blk.4.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.4.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[  72/1147]                blk.4.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[  73/1147]             blk.4.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.4.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[  74/1147]           blk.4.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[  75/1147]                blk.4.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[  76/1147]                blk.4.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[  77/1147]               blk.4.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[  78/1147]           blk.4.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type q5_K for tensor blk.4.ffn_down_exps.weight
converting to q5_K .. size =  7168.00 MiB ->  2464.00 MiB
[  79/1147]           blk.4.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.4.ffn_gate_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[  80/1147]             blk.4.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.4.ffn_up_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[  81/1147]                blk.4.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[  82/1147]          blk.5.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[  83/1147]           blk.5.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[  84/1147]               blk.5.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[  85/1147]                blk.5.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.5.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[  86/1147]                blk.5.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[  87/1147]             blk.5.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.5.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[  88/1147]           blk.5.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[  89/1147]                blk.5.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[  90/1147]                blk.5.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[  91/1147]               blk.5.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[  92/1147]            blk.5.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[  93/1147]          blk.5.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.5.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[  94/1147]          blk.5.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.5.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[  95/1147]            blk.5.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.5.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[  96/1147]               blk.5.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[  97/1147]           blk.5.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type q5_K for tensor blk.5.ffn_down_exps.weight
converting to q5_K .. size =  7168.00 MiB ->  2464.00 MiB
[  98/1147]           blk.5.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.5.ffn_gate_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[  99/1147]             blk.5.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.5.ffn_up_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 100/1147]                blk.5.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 101/1147]               blk.6.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 102/1147]            blk.6.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 103/1147]          blk.6.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.6.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 104/1147]          blk.6.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.6.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 105/1147]            blk.6.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.6.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 106/1147]          blk.6.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 107/1147]           blk.6.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 108/1147]               blk.6.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 109/1147]                blk.6.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.6.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 110/1147]                blk.6.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 111/1147]             blk.6.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.6.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 112/1147]           blk.6.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 113/1147]                blk.6.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 114/1147]                blk.6.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 115/1147]               blk.6.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 116/1147]           blk.6.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type q5_K for tensor blk.6.ffn_down_exps.weight
converting to q5_K .. size =  7168.00 MiB ->  2464.00 MiB
[ 117/1147]           blk.6.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.6.ffn_gate_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 118/1147]             blk.6.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.6.ffn_up_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 119/1147]                blk.6.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 120/1147]               blk.7.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 121/1147]            blk.7.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 122/1147]          blk.7.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.7.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 123/1147]          blk.7.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.7.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 124/1147]            blk.7.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.7.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 125/1147]          blk.7.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 126/1147]           blk.7.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 127/1147]               blk.7.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 128/1147]                blk.7.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.7.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 129/1147]                blk.7.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 130/1147]             blk.7.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.7.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 131/1147]           blk.7.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 132/1147]                blk.7.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 133/1147]                blk.7.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 134/1147]               blk.7.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 135/1147]           blk.7.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type q5_K for tensor blk.7.ffn_down_exps.weight
converting to q5_K .. size =  7168.00 MiB ->  2464.00 MiB
[ 136/1147]           blk.7.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.7.ffn_gate_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 137/1147]             blk.7.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.7.ffn_up_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 138/1147]                blk.7.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 139/1147]               blk.8.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 140/1147]            blk.8.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 141/1147]          blk.8.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.8.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 142/1147]          blk.8.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.8.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 143/1147]            blk.8.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.8.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 144/1147]          blk.8.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 145/1147]           blk.8.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 146/1147]               blk.8.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 147/1147]                blk.8.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.8.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 148/1147]                blk.8.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 149/1147]             blk.8.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.8.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 150/1147]           blk.8.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 151/1147]                blk.8.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 152/1147]                blk.8.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 153/1147]               blk.8.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 154/1147]           blk.8.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type q5_K for tensor blk.8.ffn_down_exps.weight
converting to q5_K .. size =  7168.00 MiB ->  2464.00 MiB
[ 155/1147]           blk.8.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.8.ffn_gate_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 156/1147]             blk.8.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.8.ffn_up_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 157/1147]                blk.8.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 158/1147]               blk.9.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 159/1147]            blk.9.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 160/1147]          blk.9.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.9.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 161/1147]          blk.9.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.9.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 162/1147]            blk.9.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.9.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 163/1147]          blk.9.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 164/1147]           blk.9.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 165/1147]               blk.9.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 166/1147]                blk.9.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.9.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 167/1147]                blk.9.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 168/1147]             blk.9.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.9.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 169/1147]           blk.9.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 170/1147]                blk.9.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 171/1147]                blk.9.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 172/1147]              blk.10.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 173/1147]           blk.10.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 174/1147]         blk.10.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.10.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 175/1147]         blk.10.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.10.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 176/1147]           blk.10.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.10.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 177/1147]         blk.10.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 178/1147]          blk.10.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 179/1147]              blk.10.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 180/1147]               blk.10.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.10.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 181/1147]               blk.10.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 182/1147]            blk.10.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.10.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 183/1147]          blk.10.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 184/1147]               blk.10.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 185/1147]               blk.10.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 186/1147]               blk.9.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 187/1147]           blk.9.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.9.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 188/1147]           blk.9.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.9.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 189/1147]             blk.9.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.9.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 190/1147]                blk.9.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 191/1147]              blk.10.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 192/1147]          blk.10.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.10.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 193/1147]          blk.10.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.10.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 194/1147]            blk.10.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.10.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 195/1147]               blk.10.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 196/1147]              blk.11.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 197/1147]           blk.11.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 198/1147]         blk.11.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.11.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 199/1147]         blk.11.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.11.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 200/1147]           blk.11.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.11.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 201/1147]         blk.11.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 202/1147]          blk.11.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 203/1147]              blk.11.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 204/1147]               blk.11.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.11.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 205/1147]               blk.11.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 206/1147]            blk.11.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.11.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 207/1147]          blk.11.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 208/1147]               blk.11.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 209/1147]               blk.11.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 210/1147]              blk.11.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 211/1147]          blk.11.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.11.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 212/1147]          blk.11.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.11.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 213/1147]            blk.11.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.11.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 214/1147]               blk.11.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 215/1147]              blk.12.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 216/1147]           blk.12.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 217/1147]         blk.12.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.12.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 218/1147]         blk.12.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.12.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 219/1147]           blk.12.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.12.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 220/1147]         blk.12.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 221/1147]          blk.12.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 222/1147]              blk.12.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 223/1147]               blk.12.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.12.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 224/1147]               blk.12.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 225/1147]            blk.12.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.12.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 226/1147]          blk.12.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 227/1147]               blk.12.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 228/1147]               blk.12.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 229/1147]              blk.12.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 230/1147]          blk.12.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.12.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 231/1147]          blk.12.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.12.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 232/1147]            blk.12.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.12.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 233/1147]               blk.12.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 234/1147]              blk.13.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 235/1147]           blk.13.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 236/1147]         blk.13.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.13.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 237/1147]         blk.13.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.13.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 238/1147]           blk.13.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.13.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 239/1147]         blk.13.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 240/1147]          blk.13.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 241/1147]              blk.13.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 242/1147]               blk.13.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.13.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 243/1147]               blk.13.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 244/1147]            blk.13.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.13.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 245/1147]          blk.13.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 246/1147]               blk.13.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 247/1147]               blk.13.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 248/1147]              blk.13.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 249/1147]          blk.13.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.13.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 250/1147]          blk.13.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.13.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 251/1147]            blk.13.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.13.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 252/1147]               blk.13.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 253/1147]              blk.14.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 254/1147]           blk.14.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 255/1147]         blk.14.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.14.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 256/1147]         blk.14.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.14.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 257/1147]           blk.14.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.14.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 258/1147]         blk.14.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 259/1147]          blk.14.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 260/1147]              blk.14.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 261/1147]               blk.14.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.14.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 262/1147]               blk.14.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 263/1147]            blk.14.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.14.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 264/1147]          blk.14.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 265/1147]               blk.14.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 266/1147]               blk.14.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 267/1147]              blk.14.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 268/1147]          blk.14.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.14.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 269/1147]          blk.14.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.14.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 270/1147]            blk.14.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.14.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 271/1147]               blk.14.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 272/1147]              blk.15.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 273/1147]           blk.15.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 274/1147]         blk.15.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.15.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 275/1147]         blk.15.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.15.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 276/1147]           blk.15.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.15.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 277/1147]         blk.15.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 278/1147]          blk.15.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 279/1147]              blk.15.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 280/1147]               blk.15.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.15.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 281/1147]               blk.15.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 282/1147]            blk.15.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.15.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 283/1147]          blk.15.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 284/1147]               blk.15.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 285/1147]               blk.15.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 286/1147]              blk.15.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 287/1147]          blk.15.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.15.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 288/1147]          blk.15.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.15.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 289/1147]            blk.15.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.15.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 290/1147]               blk.15.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 291/1147]              blk.16.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 292/1147]           blk.16.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 293/1147]         blk.16.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.16.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 294/1147]         blk.16.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.16.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 295/1147]           blk.16.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.16.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 296/1147]         blk.16.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 297/1147]          blk.16.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 298/1147]              blk.16.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 299/1147]               blk.16.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.16.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 300/1147]               blk.16.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 301/1147]            blk.16.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.16.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 302/1147]          blk.16.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 303/1147]               blk.16.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 304/1147]               blk.16.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 305/1147]              blk.16.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 306/1147]          blk.16.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.16.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 307/1147]          blk.16.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.16.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 308/1147]            blk.16.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.16.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 309/1147]               blk.16.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 310/1147]              blk.17.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 311/1147]           blk.17.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 312/1147]         blk.17.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.17.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 313/1147]         blk.17.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.17.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 314/1147]           blk.17.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.17.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 315/1147]         blk.17.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 316/1147]          blk.17.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 317/1147]              blk.17.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 318/1147]               blk.17.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.17.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 319/1147]               blk.17.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 320/1147]            blk.17.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.17.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 321/1147]          blk.17.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 322/1147]               blk.17.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 323/1147]               blk.17.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 324/1147]              blk.17.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 325/1147]          blk.17.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.17.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 326/1147]          blk.17.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.17.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 327/1147]            blk.17.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.17.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 328/1147]               blk.17.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 329/1147]              blk.18.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 330/1147]           blk.18.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 331/1147]         blk.18.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.18.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 332/1147]         blk.18.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.18.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 333/1147]           blk.18.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.18.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 334/1147]         blk.18.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 335/1147]          blk.18.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 336/1147]              blk.18.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 337/1147]               blk.18.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.18.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 338/1147]               blk.18.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 339/1147]            blk.18.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.18.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 340/1147]          blk.18.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 341/1147]               blk.18.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 342/1147]               blk.18.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 343/1147]              blk.18.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 344/1147]          blk.18.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.18.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 345/1147]          blk.18.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.18.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 346/1147]            blk.18.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.18.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 347/1147]               blk.18.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 348/1147]              blk.19.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 349/1147]           blk.19.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 350/1147]         blk.19.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.19.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 351/1147]         blk.19.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.19.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 352/1147]           blk.19.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.19.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 353/1147]         blk.19.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 354/1147]          blk.19.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 355/1147]              blk.19.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 356/1147]               blk.19.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.19.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 357/1147]               blk.19.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 358/1147]            blk.19.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.19.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 359/1147]          blk.19.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 360/1147]               blk.19.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 361/1147]               blk.19.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 362/1147]              blk.19.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 363/1147]          blk.19.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.19.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 364/1147]          blk.19.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.19.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 365/1147]            blk.19.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.19.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 366/1147]               blk.19.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 367/1147]              blk.20.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 368/1147]           blk.20.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 369/1147]         blk.20.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.20.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 370/1147]         blk.20.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.20.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 371/1147]           blk.20.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.20.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 372/1147]         blk.20.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 373/1147]          blk.20.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 374/1147]              blk.20.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 375/1147]               blk.20.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.20.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 376/1147]               blk.20.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 377/1147]            blk.20.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.20.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 378/1147]          blk.20.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 379/1147]               blk.20.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 380/1147]               blk.20.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 381/1147]              blk.20.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 382/1147]          blk.20.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.20.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 383/1147]          blk.20.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.20.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 384/1147]            blk.20.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.20.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 385/1147]               blk.20.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 386/1147]              blk.21.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 387/1147]           blk.21.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 388/1147]         blk.21.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.21.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 389/1147]         blk.21.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.21.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 390/1147]           blk.21.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.21.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 391/1147]         blk.21.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 392/1147]          blk.21.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 393/1147]              blk.21.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 394/1147]               blk.21.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.21.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 395/1147]               blk.21.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 396/1147]            blk.21.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.21.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 397/1147]          blk.21.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 398/1147]               blk.21.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 399/1147]               blk.21.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 400/1147]              blk.21.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 401/1147]          blk.21.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.21.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 402/1147]          blk.21.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.21.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 403/1147]            blk.21.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.21.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 404/1147]               blk.21.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 405/1147]              blk.22.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 406/1147]           blk.22.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 407/1147]         blk.22.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.22.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 408/1147]         blk.22.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.22.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 409/1147]           blk.22.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.22.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 410/1147]         blk.22.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 411/1147]          blk.22.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 412/1147]              blk.22.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 413/1147]               blk.22.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.22.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 414/1147]               blk.22.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 415/1147]            blk.22.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.22.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 416/1147]          blk.22.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 417/1147]               blk.22.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 418/1147]               blk.22.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 419/1147]              blk.22.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 420/1147]          blk.22.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.22.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 421/1147]          blk.22.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.22.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 422/1147]            blk.22.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.22.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 423/1147]               blk.22.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 424/1147]              blk.23.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 425/1147]           blk.23.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 426/1147]         blk.23.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.23.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 427/1147]         blk.23.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.23.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 428/1147]           blk.23.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.23.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 429/1147]         blk.23.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 430/1147]          blk.23.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 431/1147]              blk.23.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 432/1147]               blk.23.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.23.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 433/1147]               blk.23.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 434/1147]            blk.23.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.23.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 435/1147]          blk.23.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 436/1147]               blk.23.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 437/1147]               blk.23.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 438/1147]              blk.23.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 439/1147]          blk.23.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.23.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 440/1147]          blk.23.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.23.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 441/1147]            blk.23.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.23.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 442/1147]               blk.23.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 443/1147]              blk.24.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 444/1147]           blk.24.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 445/1147]         blk.24.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.24.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 446/1147]         blk.24.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.24.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 447/1147]           blk.24.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.24.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 448/1147]         blk.24.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 449/1147]          blk.24.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 450/1147]              blk.24.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 451/1147]               blk.24.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.24.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 452/1147]               blk.24.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 453/1147]            blk.24.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.24.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 454/1147]          blk.24.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 455/1147]               blk.24.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 456/1147]               blk.24.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 457/1147]              blk.24.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 458/1147]          blk.24.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.24.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 459/1147]          blk.24.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.24.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 460/1147]            blk.24.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.24.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 461/1147]               blk.24.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 462/1147]              blk.25.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 463/1147]           blk.25.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 464/1147]         blk.25.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.25.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 465/1147]         blk.25.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.25.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 466/1147]           blk.25.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.25.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 467/1147]         blk.25.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 468/1147]          blk.25.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 469/1147]              blk.25.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 470/1147]               blk.25.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.25.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 471/1147]               blk.25.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 472/1147]            blk.25.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.25.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 473/1147]          blk.25.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 474/1147]               blk.25.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 475/1147]               blk.25.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 476/1147]              blk.25.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 477/1147]          blk.25.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.25.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 478/1147]          blk.25.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.25.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 479/1147]            blk.25.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.25.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 480/1147]               blk.25.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 481/1147]              blk.26.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 482/1147]           blk.26.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 483/1147]         blk.26.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.26.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 484/1147]         blk.26.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.26.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 485/1147]           blk.26.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.26.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 486/1147]         blk.26.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 487/1147]          blk.26.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 488/1147]              blk.26.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 489/1147]               blk.26.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.26.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 490/1147]               blk.26.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 491/1147]            blk.26.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.26.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 492/1147]          blk.26.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 493/1147]               blk.26.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 494/1147]               blk.26.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 495/1147]              blk.26.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 496/1147]          blk.26.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.26.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 497/1147]          blk.26.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.26.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 498/1147]            blk.26.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.26.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 499/1147]               blk.26.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 500/1147]              blk.27.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 501/1147]           blk.27.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 502/1147]         blk.27.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.27.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 503/1147]         blk.27.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.27.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 504/1147]           blk.27.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.27.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 505/1147]         blk.27.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 506/1147]          blk.27.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 507/1147]              blk.27.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 508/1147]               blk.27.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.27.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 509/1147]               blk.27.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 510/1147]            blk.27.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.27.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 511/1147]          blk.27.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 512/1147]               blk.27.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 513/1147]               blk.27.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 514/1147]              blk.27.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 515/1147]          blk.27.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.27.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 516/1147]          blk.27.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.27.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 517/1147]            blk.27.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.27.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 518/1147]               blk.27.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 519/1147]              blk.28.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 520/1147]           blk.28.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 521/1147]         blk.28.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.28.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 522/1147]         blk.28.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.28.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 523/1147]           blk.28.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.28.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 524/1147]         blk.28.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 525/1147]          blk.28.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 526/1147]              blk.28.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 527/1147]               blk.28.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.28.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 528/1147]               blk.28.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 529/1147]            blk.28.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.28.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 530/1147]          blk.28.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 531/1147]               blk.28.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 532/1147]               blk.28.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 533/1147]              blk.28.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 534/1147]          blk.28.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.28.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 535/1147]          blk.28.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.28.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 536/1147]            blk.28.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.28.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 537/1147]               blk.28.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 538/1147]              blk.29.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 539/1147]           blk.29.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 540/1147]         blk.29.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.29.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 541/1147]         blk.29.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.29.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 542/1147]           blk.29.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.29.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 543/1147]         blk.29.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 544/1147]          blk.29.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 545/1147]              blk.29.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 546/1147]               blk.29.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.29.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 547/1147]               blk.29.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 548/1147]            blk.29.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.29.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 549/1147]          blk.29.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 550/1147]               blk.29.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 551/1147]               blk.29.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 552/1147]              blk.29.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 553/1147]          blk.29.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.29.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 554/1147]          blk.29.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.29.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 555/1147]            blk.29.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.29.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 556/1147]               blk.29.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 557/1147]              blk.30.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 558/1147]           blk.30.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 559/1147]         blk.30.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.30.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 560/1147]         blk.30.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.30.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 561/1147]           blk.30.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.30.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 562/1147]         blk.30.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 563/1147]          blk.30.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 564/1147]              blk.30.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 565/1147]               blk.30.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.30.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 566/1147]               blk.30.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 567/1147]            blk.30.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.30.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 568/1147]          blk.30.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 569/1147]               blk.30.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 570/1147]               blk.30.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 571/1147]              blk.30.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 572/1147]          blk.30.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.30.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 573/1147]          blk.30.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.30.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 574/1147]            blk.30.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.30.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 575/1147]               blk.30.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 576/1147]              blk.31.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 577/1147]           blk.31.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 578/1147]         blk.31.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.31.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 579/1147]         blk.31.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.31.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 580/1147]           blk.31.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.31.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 581/1147]         blk.31.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 582/1147]          blk.31.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 583/1147]              blk.31.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 584/1147]               blk.31.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.31.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 585/1147]               blk.31.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 586/1147]            blk.31.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.31.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 587/1147]          blk.31.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 588/1147]               blk.31.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 589/1147]               blk.31.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 590/1147]              blk.31.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 591/1147]          blk.31.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.31.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 592/1147]          blk.31.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.31.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 593/1147]            blk.31.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.31.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 594/1147]               blk.31.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 595/1147]              blk.32.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 596/1147]           blk.32.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 597/1147]         blk.32.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.32.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 598/1147]         blk.32.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.32.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 599/1147]           blk.32.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.32.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 600/1147]         blk.32.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 601/1147]          blk.32.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 602/1147]              blk.32.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 603/1147]               blk.32.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.32.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 604/1147]               blk.32.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 605/1147]            blk.32.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.32.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 606/1147]          blk.32.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 607/1147]               blk.32.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 608/1147]               blk.32.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 609/1147]              blk.32.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 610/1147]          blk.32.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.32.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 611/1147]          blk.32.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.32.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 612/1147]            blk.32.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.32.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 613/1147]               blk.32.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 614/1147]              blk.33.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 615/1147]           blk.33.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 616/1147]         blk.33.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.33.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 617/1147]         blk.33.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.33.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 618/1147]           blk.33.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.33.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 619/1147]         blk.33.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 620/1147]          blk.33.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 621/1147]              blk.33.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 622/1147]               blk.33.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.33.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 623/1147]               blk.33.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 624/1147]            blk.33.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.33.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 625/1147]          blk.33.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 626/1147]               blk.33.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 627/1147]               blk.33.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 628/1147]              blk.33.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 629/1147]          blk.33.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.33.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 630/1147]          blk.33.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.33.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 631/1147]            blk.33.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.33.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 632/1147]               blk.33.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 633/1147]              blk.34.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 634/1147]           blk.34.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 635/1147]         blk.34.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.34.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 636/1147]         blk.34.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.34.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 637/1147]           blk.34.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.34.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 638/1147]         blk.34.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 639/1147]          blk.34.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 640/1147]              blk.34.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 641/1147]               blk.34.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.34.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 642/1147]               blk.34.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 643/1147]            blk.34.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.34.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 644/1147]          blk.34.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 645/1147]               blk.34.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 646/1147]               blk.34.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 647/1147]              blk.34.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 648/1147]          blk.34.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.34.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 649/1147]          blk.34.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.34.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 650/1147]            blk.34.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.34.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 651/1147]               blk.34.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 652/1147]              blk.35.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 653/1147]           blk.35.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 654/1147]         blk.35.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.35.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 655/1147]         blk.35.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.35.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 656/1147]           blk.35.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.35.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 657/1147]         blk.35.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 658/1147]          blk.35.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 659/1147]              blk.35.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 660/1147]               blk.35.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.35.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 661/1147]               blk.35.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 662/1147]            blk.35.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.35.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 663/1147]          blk.35.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 664/1147]               blk.35.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 665/1147]               blk.35.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 666/1147]              blk.35.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 667/1147]          blk.35.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.35.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 668/1147]          blk.35.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.35.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 669/1147]            blk.35.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.35.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 670/1147]               blk.35.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 671/1147]              blk.36.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 672/1147]           blk.36.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 673/1147]         blk.36.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.36.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 674/1147]         blk.36.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.36.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 675/1147]           blk.36.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.36.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 676/1147]         blk.36.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 677/1147]          blk.36.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 678/1147]              blk.36.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 679/1147]               blk.36.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.36.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 680/1147]               blk.36.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 681/1147]            blk.36.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.36.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 682/1147]          blk.36.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 683/1147]               blk.36.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 684/1147]               blk.36.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 685/1147]              blk.36.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 686/1147]          blk.36.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.36.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 687/1147]          blk.36.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.36.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 688/1147]            blk.36.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.36.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 689/1147]               blk.36.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 690/1147]              blk.37.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 691/1147]           blk.37.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 692/1147]         blk.37.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.37.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 693/1147]         blk.37.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.37.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 694/1147]           blk.37.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.37.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 695/1147]         blk.37.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 696/1147]          blk.37.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 697/1147]              blk.37.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 698/1147]               blk.37.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.37.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 699/1147]               blk.37.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 700/1147]            blk.37.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.37.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 701/1147]          blk.37.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 702/1147]               blk.37.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 703/1147]               blk.37.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 704/1147]              blk.37.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 705/1147]          blk.37.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.37.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 706/1147]          blk.37.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.37.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 707/1147]            blk.37.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.37.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 708/1147]               blk.37.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 709/1147]              blk.38.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 710/1147]           blk.38.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 711/1147]         blk.38.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.38.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 712/1147]         blk.38.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.38.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 713/1147]           blk.38.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.38.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 714/1147]         blk.38.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 715/1147]          blk.38.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 716/1147]              blk.38.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 717/1147]               blk.38.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.38.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 718/1147]               blk.38.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 719/1147]            blk.38.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.38.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 720/1147]          blk.38.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 721/1147]               blk.38.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 722/1147]               blk.38.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 723/1147]              blk.38.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 724/1147]          blk.38.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.38.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 725/1147]          blk.38.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.38.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 726/1147]            blk.38.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.38.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 727/1147]               blk.38.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 728/1147]              blk.39.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 729/1147]           blk.39.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 730/1147]         blk.39.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.39.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 731/1147]         blk.39.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.39.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 732/1147]           blk.39.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.39.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 733/1147]         blk.39.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 734/1147]          blk.39.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 735/1147]              blk.39.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 736/1147]               blk.39.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.39.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 737/1147]               blk.39.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 738/1147]            blk.39.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.39.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 739/1147]          blk.39.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 740/1147]               blk.39.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 741/1147]               blk.39.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 742/1147]              blk.39.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 743/1147]          blk.39.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.39.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 744/1147]          blk.39.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.39.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 745/1147]            blk.39.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.39.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 746/1147]               blk.39.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 747/1147]              blk.40.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 748/1147]           blk.40.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 749/1147]         blk.40.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.40.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 750/1147]         blk.40.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.40.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 751/1147]           blk.40.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.40.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 752/1147]         blk.40.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 753/1147]          blk.40.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 754/1147]              blk.40.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 755/1147]               blk.40.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.40.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 756/1147]               blk.40.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 757/1147]            blk.40.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.40.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 758/1147]          blk.40.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 759/1147]               blk.40.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 760/1147]               blk.40.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 761/1147]              blk.40.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 762/1147]          blk.40.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.40.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 763/1147]          blk.40.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.40.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 764/1147]            blk.40.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.40.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 765/1147]               blk.40.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 766/1147]              blk.41.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 767/1147]           blk.41.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 768/1147]         blk.41.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.41.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 769/1147]         blk.41.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.41.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 770/1147]           blk.41.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.41.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 771/1147]         blk.41.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 772/1147]          blk.41.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 773/1147]              blk.41.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 774/1147]               blk.41.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.41.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 775/1147]               blk.41.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 776/1147]            blk.41.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.41.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 777/1147]          blk.41.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 778/1147]               blk.41.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 779/1147]               blk.41.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 780/1147]              blk.41.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 781/1147]          blk.41.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.41.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 782/1147]          blk.41.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.41.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 783/1147]            blk.41.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.41.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 784/1147]               blk.41.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 785/1147]              blk.42.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 786/1147]           blk.42.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 787/1147]         blk.42.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.42.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 788/1147]         blk.42.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.42.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 789/1147]           blk.42.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.42.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 790/1147]         blk.42.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 791/1147]          blk.42.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 792/1147]              blk.42.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 793/1147]               blk.42.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.42.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 794/1147]               blk.42.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 795/1147]            blk.42.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.42.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 796/1147]          blk.42.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 797/1147]               blk.42.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 798/1147]               blk.42.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 799/1147]              blk.42.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 800/1147]          blk.42.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.42.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 801/1147]          blk.42.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.42.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 802/1147]            blk.42.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.42.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 803/1147]               blk.42.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 804/1147]              blk.43.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 805/1147]           blk.43.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 806/1147]         blk.43.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.43.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 807/1147]         blk.43.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.43.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 808/1147]           blk.43.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.43.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 809/1147]         blk.43.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 810/1147]          blk.43.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 811/1147]              blk.43.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 812/1147]               blk.43.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.43.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 813/1147]               blk.43.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 814/1147]            blk.43.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.43.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 815/1147]          blk.43.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 816/1147]               blk.43.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 817/1147]               blk.43.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 818/1147]              blk.43.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 819/1147]          blk.43.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.43.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 820/1147]          blk.43.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.43.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 821/1147]            blk.43.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.43.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 822/1147]               blk.43.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 823/1147]              blk.44.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 824/1147]           blk.44.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 825/1147]         blk.44.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.44.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 826/1147]         blk.44.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.44.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 827/1147]           blk.44.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.44.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 828/1147]         blk.44.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 829/1147]          blk.44.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 830/1147]              blk.44.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 831/1147]               blk.44.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.44.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 832/1147]               blk.44.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 833/1147]            blk.44.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.44.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 834/1147]          blk.44.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 835/1147]               blk.44.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 836/1147]               blk.44.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 837/1147]              blk.44.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 838/1147]          blk.44.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.44.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 839/1147]          blk.44.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.44.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 840/1147]            blk.44.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.44.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 841/1147]               blk.44.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 842/1147]              blk.45.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 843/1147]           blk.45.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 844/1147]         blk.45.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.45.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 845/1147]         blk.45.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.45.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 846/1147]           blk.45.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.45.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 847/1147]         blk.45.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 848/1147]          blk.45.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 849/1147]              blk.45.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 850/1147]               blk.45.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.45.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 851/1147]               blk.45.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 852/1147]            blk.45.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.45.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 853/1147]          blk.45.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 854/1147]               blk.45.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 855/1147]               blk.45.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 856/1147]              blk.45.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 857/1147]          blk.45.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.45.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 858/1147]          blk.45.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.45.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 859/1147]            blk.45.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.45.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 860/1147]               blk.45.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 861/1147]              blk.46.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 862/1147]           blk.46.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 863/1147]         blk.46.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.46.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 864/1147]         blk.46.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.46.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 865/1147]           blk.46.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.46.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 866/1147]         blk.46.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 867/1147]          blk.46.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 868/1147]              blk.46.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 869/1147]               blk.46.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.46.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 870/1147]               blk.46.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 871/1147]            blk.46.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.46.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 872/1147]          blk.46.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 873/1147]               blk.46.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 874/1147]               blk.46.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 875/1147]              blk.46.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 876/1147]          blk.46.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.46.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 877/1147]          blk.46.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.46.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 878/1147]            blk.46.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.46.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 879/1147]               blk.46.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 880/1147]              blk.47.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 881/1147]           blk.47.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 882/1147]         blk.47.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.47.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 883/1147]         blk.47.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.47.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 884/1147]           blk.47.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.47.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 885/1147]         blk.47.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 886/1147]          blk.47.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 887/1147]              blk.47.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 888/1147]               blk.47.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.47.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 889/1147]               blk.47.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 890/1147]            blk.47.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.47.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 891/1147]          blk.47.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 892/1147]               blk.47.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 893/1147]               blk.47.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 894/1147]              blk.47.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 895/1147]          blk.47.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.47.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 896/1147]          blk.47.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.47.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 897/1147]            blk.47.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.47.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 898/1147]               blk.47.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 899/1147]              blk.48.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 900/1147]           blk.48.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 901/1147]         blk.48.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.48.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 902/1147]         blk.48.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.48.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 903/1147]           blk.48.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.48.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 904/1147]         blk.48.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 905/1147]          blk.48.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 906/1147]              blk.48.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 907/1147]               blk.48.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.48.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 908/1147]               blk.48.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 909/1147]            blk.48.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.48.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 910/1147]          blk.48.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 911/1147]               blk.48.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 912/1147]               blk.48.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 913/1147]              blk.48.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 914/1147]          blk.48.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.48.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 915/1147]          blk.48.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.48.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 916/1147]            blk.48.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.48.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 917/1147]               blk.48.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 918/1147]              blk.49.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 919/1147]           blk.49.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 920/1147]         blk.49.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.49.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 921/1147]         blk.49.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.49.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 922/1147]           blk.49.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.49.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 923/1147]         blk.49.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 924/1147]          blk.49.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 925/1147]              blk.49.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 926/1147]               blk.49.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.49.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 927/1147]               blk.49.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 928/1147]            blk.49.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.49.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 929/1147]          blk.49.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 930/1147]               blk.49.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 931/1147]               blk.49.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 932/1147]              blk.49.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 933/1147]          blk.49.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.49.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 934/1147]          blk.49.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.49.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 935/1147]            blk.49.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.49.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 936/1147]               blk.49.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 937/1147]              blk.50.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 938/1147]           blk.50.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 939/1147]         blk.50.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.50.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 940/1147]         blk.50.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.50.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 941/1147]           blk.50.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.50.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 942/1147]         blk.50.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 943/1147]          blk.50.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 944/1147]              blk.50.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 945/1147]               blk.50.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.50.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 946/1147]               blk.50.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 947/1147]            blk.50.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.50.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 948/1147]          blk.50.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 949/1147]               blk.50.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 950/1147]               blk.50.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 951/1147]              blk.50.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 952/1147]          blk.50.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.50.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 953/1147]          blk.50.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.50.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 954/1147]            blk.50.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.50.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 955/1147]               blk.50.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 956/1147]              blk.51.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 957/1147]           blk.51.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 958/1147]         blk.51.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.51.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 959/1147]         blk.51.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.51.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 960/1147]           blk.51.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.51.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 961/1147]         blk.51.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 962/1147]          blk.51.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 963/1147]              blk.51.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 964/1147]               blk.51.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.51.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 965/1147]               blk.51.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 966/1147]            blk.51.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.51.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 967/1147]          blk.51.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 968/1147]               blk.51.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 969/1147]               blk.51.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 970/1147]              blk.51.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 971/1147]          blk.51.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.51.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 972/1147]          blk.51.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.51.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 973/1147]            blk.51.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.51.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 974/1147]               blk.51.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 975/1147]              blk.52.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 976/1147]           blk.52.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 977/1147]         blk.52.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.52.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 978/1147]         blk.52.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.52.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 979/1147]           blk.52.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.52.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 980/1147]         blk.52.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[ 981/1147]          blk.52.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[ 982/1147]              blk.52.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[ 983/1147]               blk.52.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.52.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[ 984/1147]               blk.52.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[ 985/1147]            blk.52.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.52.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[ 986/1147]          blk.52.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[ 987/1147]               blk.52.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[ 988/1147]               blk.52.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[ 989/1147]              blk.52.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 990/1147]          blk.52.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.52.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[ 991/1147]          blk.52.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.52.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 992/1147]            blk.52.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.52.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[ 993/1147]               blk.52.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[ 994/1147]              blk.53.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[ 995/1147]           blk.53.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[ 996/1147]         blk.53.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.53.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 997/1147]         blk.53.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.53.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 998/1147]           blk.53.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.53.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[ 999/1147]         blk.53.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[1000/1147]          blk.53.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[1001/1147]              blk.53.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[1002/1147]               blk.53.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.53.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[1003/1147]               blk.53.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[1004/1147]            blk.53.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.53.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[1005/1147]          blk.53.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[1006/1147]               blk.53.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[1007/1147]               blk.53.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[1008/1147]              blk.53.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[1009/1147]          blk.53.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.53.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[1010/1147]          blk.53.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.53.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[1011/1147]            blk.53.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.53.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[1012/1147]               blk.53.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[1013/1147]              blk.54.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[1014/1147]           blk.54.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[1015/1147]         blk.54.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.54.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1016/1147]         blk.54.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.54.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1017/1147]           blk.54.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.54.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1018/1147]         blk.54.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[1019/1147]          blk.54.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[1020/1147]              blk.54.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[1021/1147]               blk.54.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.54.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[1022/1147]               blk.54.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[1023/1147]            blk.54.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.54.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[1024/1147]          blk.54.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[1025/1147]               blk.54.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[1026/1147]               blk.54.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[1027/1147]              blk.54.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[1028/1147]          blk.54.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.54.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[1029/1147]          blk.54.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.54.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[1030/1147]            blk.54.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.54.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[1031/1147]               blk.54.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[1032/1147]              blk.55.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[1033/1147]           blk.55.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[1034/1147]         blk.55.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.55.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1035/1147]         blk.55.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.55.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1036/1147]           blk.55.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.55.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1037/1147]         blk.55.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[1038/1147]          blk.55.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[1039/1147]              blk.55.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[1040/1147]               blk.55.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.55.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[1041/1147]               blk.55.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[1042/1147]            blk.55.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.55.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[1043/1147]          blk.55.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[1044/1147]               blk.55.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[1045/1147]               blk.55.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[1046/1147]              blk.55.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[1047/1147]          blk.55.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.55.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[1048/1147]          blk.55.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.55.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[1049/1147]            blk.55.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.55.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[1050/1147]               blk.55.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[1051/1147]              blk.56.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[1052/1147]           blk.56.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[1053/1147]         blk.56.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.56.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1054/1147]         blk.56.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.56.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1055/1147]           blk.56.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.56.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1056/1147]         blk.56.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[1057/1147]          blk.56.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[1058/1147]              blk.56.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[1059/1147]               blk.56.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.56.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[1060/1147]               blk.56.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[1061/1147]            blk.56.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.56.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[1062/1147]          blk.56.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[1063/1147]               blk.56.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[1064/1147]               blk.56.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[1065/1147]              blk.56.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[1066/1147]          blk.56.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.56.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[1067/1147]          blk.56.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.56.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[1068/1147]            blk.56.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.56.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[1069/1147]               blk.56.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[1070/1147]              blk.57.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[1071/1147]           blk.57.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[1072/1147]         blk.57.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.57.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1073/1147]         blk.57.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.57.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1074/1147]           blk.57.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.57.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1075/1147]         blk.57.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[1076/1147]          blk.57.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[1077/1147]              blk.57.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[1078/1147]               blk.57.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.57.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[1079/1147]               blk.57.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[1080/1147]            blk.57.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.57.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[1081/1147]          blk.57.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[1082/1147]               blk.57.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[1083/1147]               blk.57.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[1084/1147]              blk.57.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[1085/1147]          blk.57.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.57.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[1086/1147]          blk.57.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.57.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[1087/1147]            blk.57.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.57.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[1088/1147]               blk.57.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[1089/1147]              blk.58.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[1090/1147]           blk.58.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[1091/1147]         blk.58.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.58.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1092/1147]         blk.58.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.58.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1093/1147]           blk.58.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.58.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1094/1147]         blk.58.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[1095/1147]          blk.58.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[1096/1147]              blk.58.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[1097/1147]               blk.58.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.58.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[1098/1147]               blk.58.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[1099/1147]            blk.58.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.58.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[1100/1147]          blk.58.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[1101/1147]               blk.58.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[1102/1147]               blk.58.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[1103/1147]              blk.58.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[1104/1147]          blk.58.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.58.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[1105/1147]          blk.58.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.58.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[1106/1147]            blk.58.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.58.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[1107/1147]               blk.58.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[1108/1147]              blk.59.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[1109/1147]           blk.59.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[1110/1147]         blk.59.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.59.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1111/1147]         blk.59.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.59.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1112/1147]           blk.59.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.59.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1113/1147]         blk.59.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[1114/1147]          blk.59.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[1115/1147]              blk.59.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[1116/1147]               blk.59.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.59.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[1117/1147]               blk.59.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[1118/1147]            blk.59.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.59.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[1119/1147]          blk.59.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[1120/1147]               blk.59.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[1121/1147]               blk.59.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[1122/1147]              blk.59.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[1123/1147]          blk.59.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.59.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[1124/1147]          blk.59.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.59.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[1125/1147]            blk.59.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.59.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[1126/1147]               blk.59.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[1127/1147]              blk.60.exp_probs_b.bias - [  256,     1,     1,     1], type =    f32, size =    0.001 MB
[1128/1147]           blk.60.ffn_gate_inp.weight - [ 7168,   256,     1,     1], type =    f32, size =    7.000 MB
[1129/1147]         blk.60.ffn_down_shexp.weight - [ 2048,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.60.ffn_down_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1130/1147]         blk.60.ffn_gate_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.60.ffn_gate_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1131/1147]           blk.60.ffn_up_shexp.weight - [ 7168,  2048,     1,     1], type =    f16, Using custom type q6_K for tensor blk.60.ffn_up_shexp.weight
converting to q6_K .. size =    28.00 MiB ->    11.48 MiB
[1132/1147]         blk.60.attn_kv_a_norm.weight - [  512,     1,     1,     1], type =    f32, size =    0.002 MB
[1133/1147]          blk.60.attn_kv_a_mqa.weight - [ 7168,   576,     1,     1], type =    f16, converting to q6_K .. size =     7.88 MiB ->     3.23 MiB
[1134/1147]              blk.60.attn_kv_b.weight - [  512, 32768,     1,     1], type =    f16, converting to q6_K .. size =    32.00 MiB ->    13.12 MiB
[1135/1147]               blk.60.attn_k_b.weight - [  128, 65536,     1,     1], type =    f16, 

llama_tensor_get_type : tensor cols 128 x 65536 are not divisible by 256, required for q6_K - using fallback quantization q8_0

====== llama_model_quantize_internal: did not find weights for blk.60.attn_k_b.weight
converting to q8_0 .. size =    16.00 MiB ->     8.50 MiB
[1136/1147]               blk.60.attn_v_b.weight - [  512, 16384,     1,     1], type =    f16, converting to q6_K .. size =    16.00 MiB ->     6.56 MiB
[1137/1147]            blk.60.attn_output.weight - [16384,  7168,     1,     1], type =    f16, Using custom type q6_K for tensor blk.60.attn_output.weight
converting to q6_K .. size =   224.00 MiB ->    91.88 MiB
[1138/1147]          blk.60.attn_q_a_norm.weight - [ 1536,     1,     1,     1], type =    f32, size =    0.006 MB
[1139/1147]               blk.60.attn_q_a.weight - [ 7168,  1536,     1,     1], type =    f16, converting to q6_K .. size =    21.00 MiB ->     8.61 MiB
[1140/1147]               blk.60.attn_q_b.weight - [ 1536, 24576,     1,     1], type =    f16, converting to q6_K .. size =    72.00 MiB ->    29.53 MiB
[1141/1147]                        output.weight - [ 7168, 129280,     1,     1], type =    f16, Using custom type q6_K for tensor output.weight

====== llama_model_quantize_internal: did not find weights for output.weight
converting to q6_K .. size =  1767.50 MiB ->   724.95 MiB
[1142/1147]              blk.60.attn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[1143/1147]          blk.60.ffn_down_exps.weight - [ 2048,  7168,   256,     1], type =    f16, Using custom type iq4_k for tensor blk.60.ffn_down_exps.weight
converting to iq4_k .. size =  7168.00 MiB ->  2016.00 MiB
[1144/1147]          blk.60.ffn_gate_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.60.ffn_gate_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[1145/1147]            blk.60.ffn_up_exps.weight - [ 7168,  2048,   256,     1], type =    f16, Using custom type iq3_k for tensor blk.60.ffn_up_exps.weight
converting to iq3_k .. size =  7168.00 MiB ->  1540.00 MiB
[1146/1147]               blk.60.ffn_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
[1147/1147]                   output_norm.weight - [ 7168,     1,     1,     1], type =    f32, size =    0.027 MB
llama_model_quantize_internal: model size  = 1282038.27 MB
llama_model_quantize_internal: quant size  = 318818.01 MB
llama_model_quantize_internal: WARNING: 61 of 785 tensor(s) required fallback quantization
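For reference, the repeated `q8_0` fallback above comes from the row length of the `attn_k_b` tensors: k-quants such as `q6_K` pack each row into 256-value super-blocks, so a tensor with only 128 columns cannot use them and the quantizer drops to `q8_0` (32-value blocks). The 61 fallback tensors reported in the summary are exactly the one `attn_k_b` per layer. A minimal sketch of that check (hypothetical helper name, not the actual `llama_tensor_get_type` code):

```python
# Minimal sketch of the k-quant fallback seen in the log above.
# QK_K / QK8_0 are the block sizes ggml uses for k-quants and q8_0;
# pick_quant_type is a hypothetical helper, not llama.cpp code.
QK_K = 256   # super-block size required by q6_K (and other k-quants)
QK8_0 = 32   # block size of the q8_0 fallback

def pick_quant_type(n_cols: int, requested: str = "q6_K") -> str:
    """Return the requested k-quant if the row length allows it,
    otherwise fall back to q8_0, mirroring the warning in the log."""
    if n_cols % QK_K != 0:
        print(f"cols {n_cols} are not divisible by {QK_K}, "
              f"required for {requested} - using fallback quantization q8_0")
        assert n_cols % QK8_0 == 0
        return "q8_0"
    return requested

# blk.*.attn_k_b.weight is [128, 65536]: 128 columns -> q8_0 fallback,
# hence "61 of 785 tensor(s) required fallback quantization" (one per layer).
print(pick_quant_type(128))    # -> q8_0
print(pick_quant_type(7168))   # -> q6_K
```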

👤 davidsyoung commented the 2025-03-18 at 10:41:34:

PPL run
./build/bin/llama-perplexity -m /models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf -f /models/wiki.test.raw -fmoe -mla 2 -fa -ts 24/24/24/24/24/24/24/24/24/24/24/24/24/24/24/24 -c 512 -ub 512 --n-gpu-layers 999 -ot "blk\.3\.ffn_(down|gate|up)_exps\.weight|blk\.4\.ffn_(down|gate|up)_exps\.weight|blk\.5\.ffn_(down|gate|up)_exps\.weight=CUDA0" -ot "blk\.6\.ffn_(down|gate|up)_exps\.weight|blk\.7\.ffn_(down|gate|up)_exps\.weight|blk\.8\.ffn_(down|gate|up)_exps\.weight=CUDA1" -ot "blk\.9\.ffn_(down|gate|up)_exps\.weight|blk\.10\.ffn_(down|gate|up)_exps\.weight|blk\.11\.ffn_(down|gate|up)_exps\.weight|blk\.12\.ffn_(down|gate|up)_exps\.weight=CUDA2" -ot "blk\.13\.ffn_(down|gate|up)_exps\.weight|blk\.14\.ffn_(down|gate|up)_exps\.weight|blk\.15\.ffn_(down|gate|up)_exps\.weight|blk\.16\.ffn_(down|gate|up)_exps\.weight=CUDA3" -ot "blk\.17\.ffn_(down|gate|up)_exps\.weight|blk\.18\.ffn_(down|gate|up)_exps\.weight|blk\.19\.ffn_(down|gate|up)_exps\.weight|blk\.20\.ffn_(down|gate|up)_exps\.weight=CUDA4" -ot "blk\.21\.ffn_(down|gate|up)_exps\.weight|blk\.22\.ffn_(down|gate|up)_exps\.weight|blk\.23\.ffn_(down|gate|up)_exps\.weight|blk\.24\.ffn_(down|gate|up)_exps\.weight=CUDA5" -ot "blk\.25\.ffn_(down|gate|up)_exps\.weight|blk\.26\.ffn_(down|gate|up)_exps\.weight|blk\.27\.ffn_(down|gate|up)_exps\.weight|blk\.28\.ffn_(down|gate|up)_exps\.weight=CUDA6" -ot "blk\.29\.ffn_(down|gate|up)_exps\.weight|blk\.30\.ffn_(down|gate|up)_exps\.weight|blk\.31\.ffn_(down|gate|up)_exps\.weight|blk\.32\.ffn_(down|gate|up)_exps\.weight=CUDA7" -ot "blk\.33\.ffn_(down|gate|up)_exps\.weight|blk\.34\.ffn_(down|gate|up)_exps\.weight|blk\.35\.ffn_(down|gate|up)_exps\.weight|blk\.36\.ffn_(down|gate|up)_exps\.weight=CUDA8" -ot "blk\.37\.ffn_(down|gate|up)_exps\.weight|blk\.38\.ffn_(down|gate|up)_exps\.weight|blk\.39\.ffn_(down|gate|up)_exps\.weight|blk\.40\.ffn_(down|gate|up)_exps\.weight=CUDA9" -ot "blk\.41\.ffn_(down|gate|up)_exps\.weight|blk\.42\.ffn_(down|gate|up)_exps\.weight|blk\.43\.ffn_(down|gate|up)_exps\.weight|blk\.44\.ffn_(down|gate|up)_exps\.weight=CUDA10" -ot "blk\.45\.ffn_(down|gate|up)_exps\.weight|blk\.46\.ffn_(down|gate|up)_exps\.weight|blk\.47\.ffn_(down|gate|up)_exps\.weight|blk\.48\.ffn_(down|gate|up)_exps\.weight=CUDA11" -ot "blk\.49\.ffn_(down|gate|up)_exps\.weight|blk\.50\.ffn_(down|gate|up)_exps\.weight|blk\.51\.ffn_(down|gate|up)_exps\.weight|blk\.52\.ffn_(down|gate|up)_exps\.weight=CUDA12" -ot "blk\.53\.ffn_(down|gate|up)_exps\.weight|blk\.54\.ffn_(down|gate|up)_exps\.weight|blk\.55\.ffn_(down|gate|up)_exps\.weight|blk\.56\.ffn_(down|gate|up)_exps\.weight=CUDA13" -ot "blk\.57\.ffn_(down|gate|up)_exps\.weight|blk\.58\.ffn_(down|gate|up)_exps\.weight|blk\.59\.ffn_(down|gate|up)_exps\.weight|blk\.60\.ffn_(down|gate|up)_exps\.weight=CUDA14" --seed 1741529602 --temp 0.5
ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 16 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 2: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 3: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 4: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 5: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 6: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 7: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 8: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 9: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 10: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 11: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 12: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 13: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 14: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 15: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
main: build = 0 (unknown)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: seed  = 1741529602
llama_model_loader: loaded meta data with 54 key-value pairs and 1147 tensors from /models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = deepseek2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = unsloth_DeepSeek R1 BF16
llama_model_loader: - kv   3:                         general.size_label str              = 256x21B
llama_model_loader: - kv   4:                            general.license str              = mit
llama_model_loader: - kv   5:                   general.base_model.count u32              = 1
llama_model_loader: - kv   6:                  general.base_model.0.name str              = DeepSeek R1
llama_model_loader: - kv   7:          general.base_model.0.organization str              = Deepseek Ai
llama_model_loader: - kv   8:              general.base_model.0.repo_url str              = https://huggingface.co/deepseek-ai/De...
llama_model_loader: - kv   9:                               general.tags arr[str,3]       = ["deepseek", "unsloth", "transformers"]
llama_model_loader: - kv  10:                          general.languages arr[str,1]       = ["en"]
llama_model_loader: - kv  11:                      deepseek2.block_count u32              = 61
llama_model_loader: - kv  12:                   deepseek2.context_length u32              = 163840
llama_model_loader: - kv  13:                 deepseek2.embedding_length u32              = 7168
llama_model_loader: - kv  14:              deepseek2.feed_forward_length u32              = 18432
llama_model_loader: - kv  15:             deepseek2.attention.head_count u32              = 128
llama_model_loader: - kv  16:          deepseek2.attention.head_count_kv u32              = 128
llama_model_loader: - kv  17:                   deepseek2.rope.freq_base f32              = 10000.000000
llama_model_loader: - kv  18: deepseek2.attention.layer_norm_rms_epsilon f32              = 0.000001
llama_model_loader: - kv  19:                deepseek2.expert_used_count u32              = 8
llama_model_loader: - kv  20:                          general.file_type u32              = 18
llama_model_loader: - kv  21:        deepseek2.leading_dense_block_count u32              = 3
llama_model_loader: - kv  22:                       deepseek2.vocab_size u32              = 129280
llama_model_loader: - kv  23:            deepseek2.attention.q_lora_rank u32              = 1536
llama_model_loader: - kv  24:           deepseek2.attention.kv_lora_rank u32              = 512
llama_model_loader: - kv  25:             deepseek2.attention.key_length u32              = 192
llama_model_loader: - kv  26:           deepseek2.attention.value_length u32              = 128
llama_model_loader: - kv  27:       deepseek2.expert_feed_forward_length u32              = 2048
llama_model_loader: - kv  28:                     deepseek2.expert_count u32              = 256
llama_model_loader: - kv  29:              deepseek2.expert_shared_count u32              = 1
llama_model_loader: - kv  30:             deepseek2.expert_weights_scale f32              = 2.500000
llama_model_loader: - kv  31:              deepseek2.expert_weights_norm bool             = true
llama_model_loader: - kv  32:               deepseek2.expert_gating_func u32              = 2
llama_model_loader: - kv  33:             deepseek2.rope.dimension_count u32              = 64
llama_model_loader: - kv  34:                deepseek2.rope.scaling.type str              = yarn
llama_model_loader: - kv  35:              deepseek2.rope.scaling.factor f32              = 40.000000
llama_model_loader: - kv  36: deepseek2.rope.scaling.original_context_length u32              = 4096
llama_model_loader: - kv  37: deepseek2.rope.scaling.yarn_log_multiplier f32              = 0.100000
llama_model_loader: - kv  38:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  39:                         tokenizer.ggml.pre str              = deepseek-v3
llama_model_loader: - kv  40:                      tokenizer.ggml.tokens arr[str,129280]  = ["<begin▁of▁sentence>", "<<3C>...
llama_model_loader: - kv  41:                  tokenizer.ggml.token_type arr[i32,129280]  = [3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  42:                      tokenizer.ggml.merges arr[str,127741]  = ["Ġ t", "Ġ a", "i n", "Ġ Ġ", "h e...
llama_model_loader: - kv  43:                tokenizer.ggml.bos_token_id u32              = 0
llama_model_loader: - kv  44:                tokenizer.ggml.eos_token_id u32              = 1
llama_model_loader: - kv  45:            tokenizer.ggml.padding_token_id u32              = 128815
llama_model_loader: - kv  46:               tokenizer.ggml.add_bos_token bool             = true
llama_model_loader: - kv  47:               tokenizer.ggml.add_eos_token bool             = false
llama_model_loader: - kv  48:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
llama_model_loader: - kv  49:               general.quantization_version u32              = 2
llama_model_loader: - kv  50:                      quantize.imatrix.file str              = /models/deepseek-config/imatrix.dat
llama_model_loader: - kv  51:                   quantize.imatrix.dataset str              = imatrix-training-full-3
llama_model_loader: - kv  52:             quantize.imatrix.entries_count i32              = 720
llama_model_loader: - kv  53:              quantize.imatrix.chunks_count i32              = 315
llama_model_loader: - type  f32:  361 tensors
llama_model_loader: - type q8_0:   62 tensors
llama_model_loader: - type q5_K:    6 tensors
llama_model_loader: - type q6_K:  550 tensors
llama_model_loader: - type iq3_k:  104 tensors
llama_model_loader: - type iq4_k:   64 tensors
llm_load_vocab: special tokens cache size = 819
llm_load_vocab: token to piece cache size = 0.8223 MB
llm_load_print_meta: format           = GGUF V3 (latest)
llm_load_print_meta: arch             = deepseek2
llm_load_print_meta: vocab type       = BPE
llm_load_print_meta: n_vocab          = 129280
llm_load_print_meta: n_merges         = 127741
llm_load_print_meta: vocab_only       = 0
llm_load_print_meta: n_ctx_train      = 163840
llm_load_print_meta: n_embd           = 7168
llm_load_print_meta: n_layer          = 61
llm_load_print_meta: n_head           = 128
llm_load_print_meta: n_head_kv        = 128
llm_load_print_meta: n_rot            = 64
llm_load_print_meta: n_swa            = 0
llm_load_print_meta: n_embd_head_k    = 192
llm_load_print_meta: n_embd_head_v    = 128
llm_load_print_meta: n_gqa            = 1
llm_load_print_meta: n_embd_k_gqa     = 24576
llm_load_print_meta: n_embd_v_gqa     = 16384
llm_load_print_meta: f_norm_eps       = 0.0e+00
llm_load_print_meta: f_norm_rms_eps   = 1.0e-06
llm_load_print_meta: f_clamp_kqv      = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: f_logit_scale    = 0.0e+00
llm_load_print_meta: n_ff             = 18432
llm_load_print_meta: n_expert         = 256
llm_load_print_meta: n_expert_used    = 8
llm_load_print_meta: causal attn      = 1
llm_load_print_meta: pooling type     = 0
llm_load_print_meta: rope type        = 0
llm_load_print_meta: rope scaling     = yarn
llm_load_print_meta: freq_base_train  = 10000.0
llm_load_print_meta: freq_scale_train = 0.025
llm_load_print_meta: n_ctx_orig_yarn  = 4096
llm_load_print_meta: rope_finetuned   = unknown
llm_load_print_meta: ssm_d_conv       = 0
llm_load_print_meta: ssm_d_inner      = 0
llm_load_print_meta: ssm_d_state      = 0
llm_load_print_meta: ssm_dt_rank      = 0
llm_load_print_meta: model type       = 671B
llm_load_print_meta: model ftype      = Q6_K
llm_load_print_meta: model params     = 672.050 B
llm_load_print_meta: model size       = 311.346 GiB (3.980 BPW) 
llm_load_print_meta: repeating layers = 309.721 GiB (3.970 BPW, 670.196 B parameters)
llm_load_print_meta: general.name     = unsloth_DeepSeek R1 BF16
llm_load_print_meta: BOS token        = 0 '<begin▁of▁sentence>'
llm_load_print_meta: EOS token        = 1 '<end▁of▁sentence>'
llm_load_print_meta: PAD token        = 128815 '<PAD▁TOKEN>'
llm_load_print_meta: LF token         = 131 'Ä'
llm_load_print_meta: max token length = 256
llm_load_print_meta: n_layer_dense_lead   = 3
llm_load_print_meta: n_lora_q             = 1536
llm_load_print_meta: n_lora_kv            = 512
llm_load_print_meta: n_ff_exp             = 2048
llm_load_print_meta: n_expert_shared      = 1
llm_load_print_meta: expert_weights_scale = 2.5
llm_load_print_meta: expert_weights_norm  = 1
llm_load_print_meta: expert_gating_func   = sigmoid
llm_load_print_meta: rope_yarn_log_mul    = 0.1000
llm_load_tensors: ggml ctx size =    7.94 MiB
Tensor blk.3.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.3.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.3.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.6.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.6.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.6.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.9.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.9.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.9.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.13.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.13.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.13.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.17.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.17.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.17.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.21.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.21.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.21.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.25.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.25.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.25.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.29.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.29.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.29.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.33.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.33.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.33.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.37.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.37.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.37.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.41.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.41.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.41.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.45.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.45.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.45.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.49.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.49.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.49.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.53.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.53.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.53.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.57.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.57.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.57.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_up_exps.weight buffer type overriden to CUDA14
llm_load_tensors: offloading 61 repeating layers to GPU
llm_load_tensors: offloading non-repeating layers to GPU
llm_load_tensors: offloaded 62/62 layers to GPU
llm_load_tensors:        CPU buffer size =   938.98 MiB
llm_load_tensors:      CUDA0 buffer size = 21105.69 MiB
llm_load_tensors:      CUDA1 buffer size = 20299.82 MiB
llm_load_tensors:      CUDA2 buffer size = 21195.82 MiB
llm_load_tensors:      CUDA3 buffer size = 21195.82 MiB
llm_load_tensors:      CUDA4 buffer size = 21195.82 MiB
llm_load_tensors:      CUDA5 buffer size = 21195.82 MiB
llm_load_tensors:      CUDA6 buffer size = 21195.82 MiB
llm_load_tensors:      CUDA7 buffer size = 20992.86 MiB
llm_load_tensors:      CUDA8 buffer size = 21195.82 MiB
llm_load_tensors:      CUDA9 buffer size = 21195.82 MiB
llm_load_tensors:     CUDA10 buffer size = 21195.82 MiB
llm_load_tensors:     CUDA11 buffer size = 21195.82 MiB
llm_load_tensors:     CUDA12 buffer size = 21195.82 MiB
llm_load_tensors:     CUDA13 buffer size = 21195.82 MiB
llm_load_tensors:     CUDA14 buffer size = 21195.82 MiB
llm_load_tensors:     CUDA15 buffer size =  1130.89 MiB
....................................................................................................
llama_new_context_with_model: n_ctx      = 2048
llama_new_context_with_model: n_batch    = 2048
llama_new_context_with_model: n_ubatch   = 512
llama_new_context_with_model: flash_attn = 1
llama_new_context_with_model: mla_attn   = 2
llama_new_context_with_model: attn_max_b = 0
llama_new_context_with_model: fused_moe  = 1
llama_new_context_with_model: ser        = -1, 0
llama_new_context_with_model: freq_base  = 10000.0
llama_new_context_with_model: freq_scale = 0.025
llama_kv_cache_init: layer 0: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 1: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 2: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 3: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 4: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 5: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 6: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 7: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 8: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 9: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 10: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 11: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 12: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 13: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 14: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 15: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 16: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 17: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 18: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 19: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 20: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 21: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 22: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 23: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 24: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 25: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 26: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 27: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 28: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 29: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 30: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 31: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 32: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 33: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 34: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 35: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 36: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 37: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 38: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 39: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 40: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 41: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 42: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 43: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 44: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 45: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 46: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 47: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 48: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 49: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 50: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 51: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 52: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 53: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 54: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 55: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 56: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 57: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 58: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 59: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 60: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init:      CUDA0 KV buffer size =     9.00 MiB
llama_kv_cache_init:      CUDA1 KV buffer size =     9.00 MiB
llama_kv_cache_init:      CUDA2 KV buffer size =     9.00 MiB
llama_kv_cache_init:      CUDA3 KV buffer size =     9.00 MiB
llama_kv_cache_init:      CUDA4 KV buffer size =     9.00 MiB
llama_kv_cache_init:      CUDA5 KV buffer size =     9.00 MiB
llama_kv_cache_init:      CUDA6 KV buffer size =     9.00 MiB
llama_kv_cache_init:      CUDA7 KV buffer size =     6.75 MiB
llama_kv_cache_init:      CUDA8 KV buffer size =     9.00 MiB
llama_kv_cache_init:      CUDA9 KV buffer size =     9.00 MiB
llama_kv_cache_init:     CUDA10 KV buffer size =     9.00 MiB
llama_kv_cache_init:     CUDA11 KV buffer size =     9.00 MiB
llama_kv_cache_init:     CUDA12 KV buffer size =     9.00 MiB
llama_kv_cache_init:     CUDA13 KV buffer size =     9.00 MiB
llama_kv_cache_init:     CUDA14 KV buffer size =     9.00 MiB
llama_kv_cache_init:     CUDA15 KV buffer size =     4.50 MiB
llama_new_context_with_model: KV self size  =  137.25 MiB, c^KV (f16):  137.25 MiB, kv^T: not used
llama_new_context_with_model:  CUDA_Host  output buffer size =     1.97 MiB
llama_new_context_with_model:      CUDA0 compute buffer size =   455.00 MiB
llama_new_context_with_model:      CUDA1 compute buffer size =   476.00 MiB
llama_new_context_with_model:      CUDA2 compute buffer size =   476.00 MiB
llama_new_context_with_model:      CUDA3 compute buffer size =   476.00 MiB
llama_new_context_with_model:      CUDA4 compute buffer size =   476.00 MiB
llama_new_context_with_model:      CUDA5 compute buffer size =   476.00 MiB
llama_new_context_with_model:      CUDA6 compute buffer size =   476.00 MiB
llama_new_context_with_model:      CUDA7 compute buffer size =   476.00 MiB
llama_new_context_with_model:      CUDA8 compute buffer size =   476.00 MiB
llama_new_context_with_model:      CUDA9 compute buffer size =   476.00 MiB
llama_new_context_with_model:     CUDA10 compute buffer size =   476.00 MiB
llama_new_context_with_model:     CUDA11 compute buffer size =   476.00 MiB
llama_new_context_with_model:     CUDA12 compute buffer size =   476.00 MiB
llama_new_context_with_model:     CUDA13 compute buffer size =   476.00 MiB
llama_new_context_with_model:     CUDA14 compute buffer size =   476.00 MiB
llama_new_context_with_model:     CUDA15 compute buffer size =   476.00 MiB
llama_new_context_with_model:  CUDA_Host compute buffer size =    18.01 MiB
llama_new_context_with_model: graph nodes  = 3487
llama_new_context_with_model: graph splits = 65

system_info: n_threads = 64 / 128 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | 
perplexity: tokenizing the input ..
perplexity: tokenization took 1167.48 ms
perplexity: calculating perplexity over 561 chunks, n_ctx=512, batch_size=2048, n_seq=4
perplexity: 22.93 seconds per pass - ETA 53.58 minutes
[1]2.5633,[2]3.3137,[3]nan,[4]nan,[5]nan,[6]nan,[7]nan,[8]nan,[9]nan,[10]nan,[11]nan,[12]nan,[13]nan,[14]nan,[15]nan,[16]nan,[17]nan,[18]nan,[19]nan,[20]nan,[21]nan,[22]nan,[23]nan,[24]nan,[25]nan,[26]nan,[27]nan,[28]nan,[29]nan,[30]nan,[31]nan,[32]nan,[33]nan,[34]nan,[35]nan,[36]nan,[37]nan,[38]nan,[39]nan,[40]nan,[41]nan,[42]nan,[43]nan,[44]nan,[45]nan,[46]nan,[47]nan,[48]nan,[49]nan,[50]nan,[51]nan,[52]nan,[53]nan,[54]nan,[55]nan,[56]nan,[57]nan,[58]nan,[59]nan,[60]nan,[61]nan,[62]nan,[63]nan,[64]nan,[65]nan,[66]nan,[67]nan,[68]nan,[69]nan,[70]nan,[71]nan,[72]nan,[73]nan,[74]nan,[75]nan,[76]nan,[77]nan,[78]nan,[79]nan,[80]nan,[81]nan,[82]nan,[83]nan,[84]nan,[85]nan,[86]nan,[87]nan,[88]nan,[89]nan,[90]nan,[91]nan,[92]nan,^C

👤 ikawrakow commented the 2025-03-18 at 10:45:08:

Did you enable the GGML_CUDA_IQK_FORCE_BF16 option when building?
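(For reference, a minimal rebuild sketch, assuming the usual CMake workflow for ik_llama.cpp; the `GGML_CUDA_IQK_FORCE_BF16` option name is the one quoted above, the remaining flags are just illustrative CUDA build defaults:)

```bash
# Hypothetical rebuild enabling the option named above.
# GGML_CUDA_IQK_FORCE_BF16 forces BF16 instead of F16 arithmetic in the IQK CUDA kernels,
# which is presumably what avoids the NaNs seen with this DeepSeek-R1 quant.
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_IQK_FORCE_BF16=ON
cmake --build build --config Release -j
```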


👤 davidsyoung commented the 2025-03-18 at 10:49:14:

D'oh. Back to the drawing board. Apologies! Will report back.


👤 davidsyoung commented the 2025-03-18 at 13:11:13:

Works! Great work!

Successful PPL run
root@f9b3ae98b5a1:/app# ./build/bin/llama-perplexity -m /models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf -f /models/wiki.test.raw -fmoe -mla 2 -fa -ts 24/24/24/24/24/24/24/24/24/24/24/24/24/24/24/24 -c 512 -ub 512 --n-gpu-layers 999 -ot "blk\.3\.ffn_(down|gate|up)_exps\.weight|blk\.4\.ffn_(down|gate|up)_exps\.weight|blk\.5\.ffn_(down|gate|up)_exps\.weight=CUDA0" -ot "blk\.6\.ffn_(down|gate|up)_exps\.weight|blk\.7\.ffn_(down|gate|up)_exps\.weight|blk\.8\.ffn_(down|gate|up)_exps\.weight=CUDA1" -ot "blk\.9\.ffn_(down|gate|up)_exps\.weight|blk\.10\.ffn_(down|gate|up)_exps\.weight|blk\.11\.ffn_(down|gate|up)_exps\.weight|blk\.12\.ffn_(down|gate|up)_exps\.weight=CUDA2" -ot "blk\.13\.ffn_(down|gate|up)_exps\.weight|blk\.14\.ffn_(down|gate|up)_exps\.weight|blk\.15\.ffn_(down|gate|up)_exps\.weight|blk\.16\.ffn_(down|gate|up)_exps\.weight=CUDA3" -ot "blk\.17\.ffn_(down|gate|up)_exps\.weight|blk\.18\.ffn_(down|gate|up)_exps\.weight|blk\.19\.ffn_(down|gate|up)_exps\.weight|blk\.20\.ffn_(down|gate|up)_exps\.weight=CUDA4" -ot "blk\.21\.ffn_(down|gate|up)_exps\.weight|blk\.22\.ffn_(down|gate|up)_exps\.weight|blk\.23\.ffn_(down|gate|up)_exps\.weight|blk\.24\.ffn_(down|gate|up)_exps\.weight=CUDA5" -ot "blk\.25\.ffn_(down|gate|up)_exps\.weight|blk\.26\.ffn_(down|gate|up)_exps\.weight|blk\.27\.ffn_(down|gate|up)_exps\.weight|blk\.28\.ffn_(down|gate|up)_exps\.weight=CUDA6" -ot "blk\.29\.ffn_(down|gate|up)_exps\.weight|blk\.30\.ffn_(down|gate|up)_exps\.weight|blk\.31\.ffn_(down|gate|up)_exps\.weight|blk\.32\.ffn_(down|gate|up)_exps\.weight=CUDA7" -ot "blk\.33\.ffn_(down|gate|up)_exps\.weight|blk\.34\.ffn_(down|gate|up)_exps\.weight|blk\.35\.ffn_(down|gate|up)_exps\.weight|blk\.36\.ffn_(down|gate|up)_exps\.weight=CUDA8" -ot "blk\.37\.ffn_(down|gate|up)_exps\.weight|blk\.38\.ffn_(down|gate|up)_exps\.weight|blk\.39\.ffn_(down|gate|up)_exps\.weight|blk\.40\.ffn_(down|gate|up)_exps\.weight=CUDA9" -ot "blk\.41\.ffn_(down|gate|up)_exps\.weight|blk\.42\.ffn_(down|gate|up)_exps\.weight|blk\.43\.ffn_(down|gate|up)_exps\.weight|blk\.44\.ffn_(down|gate|up)_exps\.weight=CUDA10" -ot "blk\.45\.ffn_(down|gate|up)_exps\.weight|blk\.46\.ffn_(down|gate|up)_exps\.weight|blk\.47\.ffn_(down|gate|up)_exps\.weight|blk\.48\.ffn_(down|gate|up)_exps\.weight=CUDA11" -ot "blk\.49\.ffn_(down|gate|up)_exps\.weight|blk\.50\.ffn_(down|gate|up)_exps\.weight|blk\.51\.ffn_(down|gate|up)_exps\.weight|blk\.52\.ffn_(down|gate|up)_exps\.weight=CUDA12" -ot "blk\.53\.ffn_(down|gate|up)_exps\.weight|blk\.54\.ffn_(down|gate|up)_exps\.weight|blk\.55\.ffn_(down|gate|up)_exps\.weight|blk\.56\.ffn_(down|gate|up)_exps\.weight=CUDA13" -ot "blk\.57\.ffn_(down|gate|up)_exps\.weight|blk\.58\.ffn_(down|gate|up)_exps\.weight|blk\.59\.ffn_(down|gate|up)_exps\.weight|blk\.60\.ffn_(down|gate|up)_exps\.weight=CUDA14" --seed 1741529602 --temp 0.5
ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 16 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 2: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 3: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 4: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 5: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 6: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 7: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 8: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 9: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 10: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 11: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 12: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 13: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 14: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 15: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
main: build = 0 (unknown)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: seed  = 1741529602
llama_model_loader: loaded meta data with 54 key-value pairs and 1147 tensors from /models/DeepSeek-R1-GGUF/DeepSeek-R1-GGUF-IQ4_K__IQ3_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = deepseek2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = unsloth_DeepSeek R1 BF16
llama_model_loader: - kv   3:                         general.size_label str              = 256x21B
llama_model_loader: - kv   4:                            general.license str              = mit
llama_model_loader: - kv   5:                   general.base_model.count u32              = 1
llama_model_loader: - kv   6:                  general.base_model.0.name str              = DeepSeek R1
llama_model_loader: - kv   7:          general.base_model.0.organization str              = Deepseek Ai
llama_model_loader: - kv   8:              general.base_model.0.repo_url str              = https://huggingface.co/deepseek-ai/De...
llama_model_loader: - kv   9:                               general.tags arr[str,3]       = ["deepseek", "unsloth", "transformers"]
llama_model_loader: - kv  10:                          general.languages arr[str,1]       = ["en"]
llama_model_loader: - kv  11:                      deepseek2.block_count u32              = 61
llama_model_loader: - kv  12:                   deepseek2.context_length u32              = 163840
llama_model_loader: - kv  13:                 deepseek2.embedding_length u32              = 7168
llama_model_loader: - kv  14:              deepseek2.feed_forward_length u32              = 18432
llama_model_loader: - kv  15:             deepseek2.attention.head_count u32              = 128
llama_model_loader: - kv  16:          deepseek2.attention.head_count_kv u32              = 128
llama_model_loader: - kv  17:                   deepseek2.rope.freq_base f32              = 10000.000000
llama_model_loader: - kv  18: deepseek2.attention.layer_norm_rms_epsilon f32              = 0.000001
llama_model_loader: - kv  19:                deepseek2.expert_used_count u32              = 8
llama_model_loader: - kv  20:                          general.file_type u32              = 18
llama_model_loader: - kv  21:        deepseek2.leading_dense_block_count u32              = 3
llama_model_loader: - kv  22:                       deepseek2.vocab_size u32              = 129280
llama_model_loader: - kv  23:            deepseek2.attention.q_lora_rank u32              = 1536
llama_model_loader: - kv  24:           deepseek2.attention.kv_lora_rank u32              = 512
llama_model_loader: - kv  25:             deepseek2.attention.key_length u32              = 192
llama_model_loader: - kv  26:           deepseek2.attention.value_length u32              = 128
llama_model_loader: - kv  27:       deepseek2.expert_feed_forward_length u32              = 2048
llama_model_loader: - kv  28:                     deepseek2.expert_count u32              = 256
llama_model_loader: - kv  29:              deepseek2.expert_shared_count u32              = 1
llama_model_loader: - kv  30:             deepseek2.expert_weights_scale f32              = 2.500000
llama_model_loader: - kv  31:              deepseek2.expert_weights_norm bool             = true
llama_model_loader: - kv  32:               deepseek2.expert_gating_func u32              = 2
llama_model_loader: - kv  33:             deepseek2.rope.dimension_count u32              = 64
llama_model_loader: - kv  34:                deepseek2.rope.scaling.type str              = yarn
llama_model_loader: - kv  35:              deepseek2.rope.scaling.factor f32              = 40.000000
llama_model_loader: - kv  36: deepseek2.rope.scaling.original_context_length u32              = 4096
llama_model_loader: - kv  37: deepseek2.rope.scaling.yarn_log_multiplier f32              = 0.100000
llama_model_loader: - kv  38:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  39:                         tokenizer.ggml.pre str              = deepseek-v3
llama_model_loader: - kv  40:                      tokenizer.ggml.tokens arr[str,129280]  = ["<begin▁of▁sentence>", "<<3C>...
llama_model_loader: - kv  41:                  tokenizer.ggml.token_type arr[i32,129280]  = [3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  42:                      tokenizer.ggml.merges arr[str,127741]  = ["Ġ t", "Ġ a", "i n", "Ġ Ġ", "h e...
llama_model_loader: - kv  43:                tokenizer.ggml.bos_token_id u32              = 0
llama_model_loader: - kv  44:                tokenizer.ggml.eos_token_id u32              = 1
llama_model_loader: - kv  45:            tokenizer.ggml.padding_token_id u32              = 128815
llama_model_loader: - kv  46:               tokenizer.ggml.add_bos_token bool             = true
llama_model_loader: - kv  47:               tokenizer.ggml.add_eos_token bool             = false
llama_model_loader: - kv  48:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
llama_model_loader: - kv  49:               general.quantization_version u32              = 2
llama_model_loader: - kv  50:                      quantize.imatrix.file str              = /models/deepseek-config/imatrix.dat
llama_model_loader: - kv  51:                   quantize.imatrix.dataset str              = imatrix-training-full-3
llama_model_loader: - kv  52:             quantize.imatrix.entries_count i32              = 720
llama_model_loader: - kv  53:              quantize.imatrix.chunks_count i32              = 315
llama_model_loader: - type  f32:  361 tensors
llama_model_loader: - type q8_0:   62 tensors
llama_model_loader: - type q5_K:    6 tensors
llama_model_loader: - type q6_K:  550 tensors
llama_model_loader: - type iq3_k:  104 tensors
llama_model_loader: - type iq4_k:   64 tensors
llm_load_vocab: special tokens cache size = 819
llm_load_vocab: token to piece cache size = 0.8223 MB
llm_load_print_meta: format           = GGUF V3 (latest)
llm_load_print_meta: arch             = deepseek2
llm_load_print_meta: vocab type       = BPE
llm_load_print_meta: n_vocab          = 129280
llm_load_print_meta: n_merges         = 127741
llm_load_print_meta: vocab_only       = 0
llm_load_print_meta: n_ctx_train      = 163840
llm_load_print_meta: n_embd           = 7168
llm_load_print_meta: n_layer          = 61
llm_load_print_meta: n_head           = 128
llm_load_print_meta: n_head_kv        = 128
llm_load_print_meta: n_rot            = 64
llm_load_print_meta: n_swa            = 0
llm_load_print_meta: n_embd_head_k    = 192
llm_load_print_meta: n_embd_head_v    = 128
llm_load_print_meta: n_gqa            = 1
llm_load_print_meta: n_embd_k_gqa     = 24576
llm_load_print_meta: n_embd_v_gqa     = 16384
llm_load_print_meta: f_norm_eps       = 0.0e+00
llm_load_print_meta: f_norm_rms_eps   = 1.0e-06
llm_load_print_meta: f_clamp_kqv      = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: f_logit_scale    = 0.0e+00
llm_load_print_meta: n_ff             = 18432
llm_load_print_meta: n_expert         = 256
llm_load_print_meta: n_expert_used    = 8
llm_load_print_meta: causal attn      = 1
llm_load_print_meta: pooling type     = 0
llm_load_print_meta: rope type        = 0
llm_load_print_meta: rope scaling     = yarn
llm_load_print_meta: freq_base_train  = 10000.0
llm_load_print_meta: freq_scale_train = 0.025
llm_load_print_meta: n_ctx_orig_yarn  = 4096
llm_load_print_meta: rope_finetuned   = unknown
llm_load_print_meta: ssm_d_conv       = 0
llm_load_print_meta: ssm_d_inner      = 0
llm_load_print_meta: ssm_d_state      = 0
llm_load_print_meta: ssm_dt_rank      = 0
llm_load_print_meta: model type       = 671B
llm_load_print_meta: model ftype      = Q6_K
llm_load_print_meta: model params     = 672.050 B
llm_load_print_meta: model size       = 311.346 GiB (3.980 BPW) 
llm_load_print_meta: repeating layers = 309.721 GiB (3.970 BPW, 670.196 B parameters)
llm_load_print_meta: general.name     = unsloth_DeepSeek R1 BF16
llm_load_print_meta: BOS token        = 0 '<begin▁of▁sentence>'
llm_load_print_meta: EOS token        = 1 '<end▁of▁sentence>'
llm_load_print_meta: PAD token        = 128815 '<PAD▁TOKEN>'
llm_load_print_meta: LF token         = 131 'Ä'
llm_load_print_meta: max token length = 256
llm_load_print_meta: n_layer_dense_lead   = 3
llm_load_print_meta: n_lora_q             = 1536
llm_load_print_meta: n_lora_kv            = 512
llm_load_print_meta: n_ff_exp             = 2048
llm_load_print_meta: n_expert_shared      = 1
llm_load_print_meta: expert_weights_scale = 2.5
llm_load_print_meta: expert_weights_norm  = 1
llm_load_print_meta: expert_gating_func   = sigmoid
llm_load_print_meta: rope_yarn_log_mul    = 0.1000
llm_load_tensors: ggml ctx size =    7.94 MiB
Tensor blk.3.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.3.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.3.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.4.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_gate_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_down_exps.weight buffer type overriden to CUDA0
Tensor blk.5.ffn_up_exps.weight buffer type overriden to CUDA0
Tensor blk.6.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.6.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.6.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.7.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_gate_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_down_exps.weight buffer type overriden to CUDA1
Tensor blk.8.ffn_up_exps.weight buffer type overriden to CUDA1
Tensor blk.9.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.9.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.9.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.10.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.11.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_gate_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_down_exps.weight buffer type overriden to CUDA2
Tensor blk.12.ffn_up_exps.weight buffer type overriden to CUDA2
Tensor blk.13.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.13.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.13.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.14.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.15.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_gate_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_down_exps.weight buffer type overriden to CUDA3
Tensor blk.16.ffn_up_exps.weight buffer type overriden to CUDA3
Tensor blk.17.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.17.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.17.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.18.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.19.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_gate_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_down_exps.weight buffer type overriden to CUDA4
Tensor blk.20.ffn_up_exps.weight buffer type overriden to CUDA4
Tensor blk.21.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.21.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.21.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.22.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.23.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_gate_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_down_exps.weight buffer type overriden to CUDA5
Tensor blk.24.ffn_up_exps.weight buffer type overriden to CUDA5
Tensor blk.25.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.25.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.25.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.26.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.27.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_gate_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_down_exps.weight buffer type overriden to CUDA6
Tensor blk.28.ffn_up_exps.weight buffer type overriden to CUDA6
Tensor blk.29.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.29.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.29.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.30.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.31.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_gate_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_down_exps.weight buffer type overriden to CUDA7
Tensor blk.32.ffn_up_exps.weight buffer type overriden to CUDA7
Tensor blk.33.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.33.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.33.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.34.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.35.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_gate_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_down_exps.weight buffer type overriden to CUDA8
Tensor blk.36.ffn_up_exps.weight buffer type overriden to CUDA8
Tensor blk.37.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.37.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.37.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.38.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.39.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_gate_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_down_exps.weight buffer type overriden to CUDA9
Tensor blk.40.ffn_up_exps.weight buffer type overriden to CUDA9
Tensor blk.41.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.41.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.41.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.42.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.43.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_gate_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_down_exps.weight buffer type overriden to CUDA10
Tensor blk.44.ffn_up_exps.weight buffer type overriden to CUDA10
Tensor blk.45.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.45.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.45.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.46.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.47.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_gate_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_down_exps.weight buffer type overriden to CUDA11
Tensor blk.48.ffn_up_exps.weight buffer type overriden to CUDA11
Tensor blk.49.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.49.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.49.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.50.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.51.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_gate_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_down_exps.weight buffer type overriden to CUDA12
Tensor blk.52.ffn_up_exps.weight buffer type overriden to CUDA12
Tensor blk.53.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.53.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.53.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.54.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.55.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_gate_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_down_exps.weight buffer type overriden to CUDA13
Tensor blk.56.ffn_up_exps.weight buffer type overriden to CUDA13
Tensor blk.57.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.57.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.57.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.58.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.59.ffn_up_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_gate_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_down_exps.weight buffer type overriden to CUDA14
Tensor blk.60.ffn_up_exps.weight buffer type overriden to CUDA14
llm_load_tensors: offloading 61 repeating layers to GPU
llm_load_tensors: offloading non-repeating layers to GPU
llm_load_tensors: offloaded 62/62 layers to GPU
llm_load_tensors:        CPU buffer size =   938.98 MiB
llm_load_tensors:      CUDA0 buffer size = 21105.69 MiB
llm_load_tensors:      CUDA1 buffer size = 20299.82 MiB
llm_load_tensors:      CUDA2 buffer size = 21195.82 MiB
llm_load_tensors:      CUDA3 buffer size = 21195.82 MiB
llm_load_tensors:      CUDA4 buffer size = 21195.82 MiB
llm_load_tensors:      CUDA5 buffer size = 21195.82 MiB
llm_load_tensors:      CUDA6 buffer size = 21195.82 MiB
llm_load_tensors:      CUDA7 buffer size = 20992.86 MiB
llm_load_tensors:      CUDA8 buffer size = 21195.82 MiB
llm_load_tensors:      CUDA9 buffer size = 21195.82 MiB
llm_load_tensors:     CUDA10 buffer size = 21195.82 MiB
llm_load_tensors:     CUDA11 buffer size = 21195.82 MiB
llm_load_tensors:     CUDA12 buffer size = 21195.82 MiB
llm_load_tensors:     CUDA13 buffer size = 21195.82 MiB
llm_load_tensors:     CUDA14 buffer size = 21195.82 MiB
llm_load_tensors:     CUDA15 buffer size =  1130.89 MiB
....................................................................................................
llama_new_context_with_model: n_ctx      = 2048
llama_new_context_with_model: n_batch    = 2048
llama_new_context_with_model: n_ubatch   = 512
llama_new_context_with_model: flash_attn = 1
llama_new_context_with_model: mla_attn   = 2
llama_new_context_with_model: attn_max_b = 0
llama_new_context_with_model: fused_moe  = 1
llama_new_context_with_model: ser        = -1, 0
llama_new_context_with_model: freq_base  = 10000.0
llama_new_context_with_model: freq_scale = 0.025
llama_kv_cache_init: layer 0: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 1: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 2: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 3: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 4: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 5: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 6: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 7: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 8: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 9: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 10: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 11: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 12: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 13: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 14: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 15: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 16: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 17: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 18: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 19: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 20: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 21: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 22: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 23: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 24: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 25: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 26: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 27: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 28: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 29: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 30: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 31: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 32: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 33: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 34: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 35: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 36: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 37: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 38: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 39: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 40: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 41: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 42: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 43: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 44: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 45: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 46: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 47: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 48: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 49: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 50: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 51: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 52: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 53: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 54: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 55: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 56: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 57: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 58: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 59: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init: layer 60: n_embd_head_qk_rope = 64, kv_lora_rank = 512
llama_kv_cache_init:      CUDA0 KV buffer size =     9.00 MiB
llama_kv_cache_init:      CUDA1 KV buffer size =     9.00 MiB
llama_kv_cache_init:      CUDA2 KV buffer size =     9.00 MiB
llama_kv_cache_init:      CUDA3 KV buffer size =     9.00 MiB
llama_kv_cache_init:      CUDA4 KV buffer size =     9.00 MiB
llama_kv_cache_init:      CUDA5 KV buffer size =     9.00 MiB
llama_kv_cache_init:      CUDA6 KV buffer size =     9.00 MiB
llama_kv_cache_init:      CUDA7 KV buffer size =     6.75 MiB
llama_kv_cache_init:      CUDA8 KV buffer size =     9.00 MiB
llama_kv_cache_init:      CUDA9 KV buffer size =     9.00 MiB
llama_kv_cache_init:     CUDA10 KV buffer size =     9.00 MiB
llama_kv_cache_init:     CUDA11 KV buffer size =     9.00 MiB
llama_kv_cache_init:     CUDA12 KV buffer size =     9.00 MiB
llama_kv_cache_init:     CUDA13 KV buffer size =     9.00 MiB
llama_kv_cache_init:     CUDA14 KV buffer size =     9.00 MiB
llama_kv_cache_init:     CUDA15 KV buffer size =     4.50 MiB
llama_new_context_with_model: KV self size  =  137.25 MiB, c^KV (f16):  137.25 MiB, kv^T: not used
llama_new_context_with_model:  CUDA_Host  output buffer size =     1.97 MiB
llama_new_context_with_model:      CUDA0 compute buffer size =   455.00 MiB
llama_new_context_with_model:      CUDA1 compute buffer size =   476.00 MiB
llama_new_context_with_model:      CUDA2 compute buffer size =   476.00 MiB
llama_new_context_with_model:      CUDA3 compute buffer size =   476.00 MiB
llama_new_context_with_model:      CUDA4 compute buffer size =   476.00 MiB
llama_new_context_with_model:      CUDA5 compute buffer size =   476.00 MiB
llama_new_context_with_model:      CUDA6 compute buffer size =   476.00 MiB
llama_new_context_with_model:      CUDA7 compute buffer size =   476.00 MiB
llama_new_context_with_model:      CUDA8 compute buffer size =   476.00 MiB
llama_new_context_with_model:      CUDA9 compute buffer size =   476.00 MiB
llama_new_context_with_model:     CUDA10 compute buffer size =   476.00 MiB
llama_new_context_with_model:     CUDA11 compute buffer size =   476.00 MiB
llama_new_context_with_model:     CUDA12 compute buffer size =   476.00 MiB
llama_new_context_with_model:     CUDA13 compute buffer size =   476.00 MiB
llama_new_context_with_model:     CUDA14 compute buffer size =   476.00 MiB
llama_new_context_with_model:     CUDA15 compute buffer size =   476.00 MiB
llama_new_context_with_model:  CUDA_Host compute buffer size =    18.01 MiB
llama_new_context_with_model: graph nodes  = 3487
llama_new_context_with_model: graph splits = 65

system_info: n_threads = 64 / 128 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | 
perplexity: tokenizing the input ..
perplexity: tokenization took 1206.19 ms
perplexity: calculating perplexity over 561 chunks, n_ctx=512, batch_size=2048, n_seq=4
perplexity: 18.19 seconds per pass - ETA 42.52 minutes
[1]2.5586,[2]3.3168,[3]2.3936,[4]2.0039,[5]1.8142,[6]1.6702,[7]1.5775,[8]1.5075,[9]1.4595,[10]1.4201,[11]1.4070,[12]1.4370,[13]1.4496,[14]1.5781,[15]1.7114,[16]1.7697,[17]1.9326,[18]2.0621,[19]2.0268,[20]2.0155,[21]2.1268,[22]2.1007,[23]2.0732,[24]2.0870,[25]2.0591,[26]2.0345,[27]2.0835,[28]2.0909,[29]2.1415,[30]2.1734,[31]2.2082,[32]2.2263,[33]2.2655,[34]2.3099,[35]2.3604,[36]2.4126,[37]2.4474,[38]2.4946,[39]2.5347,[40]2.5951,[41]2.6395,[42]2.6509,[43]2.7003,[44]2.7157,[45]2.7953,[46]2.8467,[47]2.8028,[48]2.7555,[49]2.7335,[50]2.7524,[51]2.7983,[52]2.8127,[53]2.8648,[54]2.8779,[55]2.9093,[56]2.9418,[57]2.9568,[58]2.9947,[59]3.0068,[60]3.0541,[61]3.0963,[62]3.1488,[63]3.1812,[64]3.2268,[65]3.2363,[66]3.2208,[67]3.1983,[68]3.2313,[69]3.2274,[70]3.2453,[71]3.2632,[72]3.2785,[73]3.2926,[74]3.3158,[75]3.2951,[76]3.2468,[77]3.2033,[78]3.1978,[79]3.1752,[80]3.1585,[81]3.1220,[82]3.1263,[83]3.0939,[84]3.0572,[85]3.0226,[86]2.9979,[87]2.9913,[88]2.9620,[89]2.9446,[90]2.9189,[91]2.8900,[92]2.8640,[93]2.8371,[94]2.8118,[95]2.7877,[96]2.7867,[97]2.7933,[98]2.7781,[99]2.7607,[100]2.7637,[101]2.7557,[102]2.7735,[103]2.8013,[104]2.8209,[105]2.8182,[106]2.8415,[107]2.8660,[108]2.8867,[109]2.9204,[110]2.9543,[111]2.9739,[112]2.9475,[113]2.9354,[114]2.9137,[115]2.8968,[116]2.8835,[117]2.8601,[118]2.8389,[119]2.8175,[120]2.7984,[121]2.7821,[122]2.7637,[123]2.7472,[124]2.7279,[125]2.7105,[126]2.6936,[127]2.6807,[128]2.6722,[129]2.6625,[130]2.6503,[131]2.6437,[132]2.6510,[133]2.6603,[134]2.6677,[135]2.6788,[136]2.6953,[137]2.7118,[138]2.7200,[139]2.7321,[140]2.7326,[141]2.7339,[142]2.7328,[143]2.7330,[144]2.7294,[145]2.7202,[146]2.7186,[147]2.7227,[148]2.7223,[149]2.7235,[150]2.7177,[151]2.7155,[152]2.7123,[153]2.7083,[154]2.7085,[155]2.7128,[156]2.7143,[157]2.7202,[158]2.7292,[159]2.7310,[160]2.7400,[161]2.7484,[162]2.7577,[163]2.7627,[164]2.7832,[165]2.8070,[166]2.8242,[167]2.8363,[168]2.8608,[169]2.8836,[170]2.9055,[171]2.9290,[172]2.9125,[173]2.8952,[174]2.8821,[175]2.8691,[176]2.8562,[177]2.8448,[178]2.8317,[179]2.8177,[180]2.8216,[181]2.8357,[182]2.8509,[183]2.8654,[184]2.8795,[185]2.8898,[186]2.9065,[187]2.9222,[188]2.9363,[189]2.9468,[190]2.9471,[191]2.9542,[192]2.9580,[193]2.9635,[194]2.9830,[195]2.9918,[196]3.0051,[197]3.0148,[198]3.0188,[199]3.0241,[200]3.0233,[201]3.0385,[202]3.0337,[203]3.0388,[204]3.0422,[205]3.0420,[206]3.0443,[207]3.0531,[208]3.0634,[209]3.0728,[210]3.0731,[211]3.0685,[212]3.0687,[213]3.0762,[214]3.0784,[215]3.0843,[216]3.0848,[217]3.0807,[218]3.0809,[219]3.0820,[220]3.0810,[221]3.0813,[222]3.0813,[223]3.0817,[224]3.0867,[225]3.0886,[226]3.0803,[227]3.0781,[228]3.0803,[229]3.0848,[230]3.0911,[231]3.0974,[232]3.0887,[233]3.0807,[234]3.0811,[235]3.0794,[236]3.0880,[237]3.0963,[238]3.1062,[239]3.1160,[240]3.1252,[241]3.1364,[242]3.1510,[243]3.1645,[244]3.1728,[245]3.1840,[246]3.1943,[247]3.1934,[248]3.1892,[249]3.1868,[250]3.1800,[251]3.1777,[252]3.1798,[253]3.1834,[254]3.1905,[255]3.1969,[256]3.2006,[257]3.2030,[258]3.2042,[259]3.2075,[260]3.2096,[261]3.2107,[262]3.2097,[263]3.2156,[264]3.2179,[265]3.2182,[266]3.2202,[267]3.2229,[268]3.2268,[269]3.2301,[270]3.2294,[271]3.2280,[272]3.2208,[273]3.2207,[274]3.2138,[275]3.2031,[276]3.1926,[277]3.1942,[278]3.2046,[279]3.2111,[280]3.2190,[281]3.2263,[282]3.2325,[283]3.2389,[284]3.2453,[285]3.2590,[286]3.2612,[287]3.2646,[288]3.2693,[289]3.2719,[290]3.2635,[291]3.2541,[292]3.2530,[293]3.2522,[294]3.2497,[295]3.2474,[296]3.2496,[297]3.2501,[298]3.2552,[299]3.2615,[300]3.2648,[301]3.2686,[302]3.2711,[303]3.2731,[304]3.2725,[305]3.2842,
[306]3.2918,[307]3.3028,[308]3.2915,[309]3.2864,[310]3.2767,[311]3.2808,[312]3.2834,[313]3.2905,[314]3.2929,[315]3.2962,[316]3.2976,[317]3.2993,[318]3.2998,[319]3.3001,[320]3.3043,[321]3.3045,[322]3.3063,[323]3.3130,[324]3.3136,[325]3.3189,[326]3.3235,[327]3.3277,[328]3.3307,[329]3.3323,[330]3.3385,[331]3.3425,[332]3.3470,[333]3.3456,[334]3.3455,[335]3.3460,[336]3.3460,[337]3.3469,[338]3.3473,[339]3.3498,[340]3.3535,[341]3.3589,[342]3.3677,[343]3.3770,[344]3.3823,[345]3.3740,[346]3.3661,[347]3.3608,[348]3.3532,[349]3.3494,[350]3.3475,[351]3.3521,[352]3.3673,[353]3.3764,[354]3.3896,[355]3.3983,[356]3.4036,[357]3.4156,[358]3.4254,[359]3.4285,[360]3.4348,[361]3.4443,[362]3.4531,[363]3.4589,[364]3.4653,[365]3.4718,[366]3.4826,[367]3.4915,[368]3.4983,[369]3.5063,[370]3.5149,[371]3.5286,[372]3.5377,[373]3.5409,[374]3.5444,[375]3.5493,[376]3.5624,[377]3.5738,[378]3.5767,[379]3.5761,[380]3.5727,
[381]3.5775,[382]3.5833,[383]3.5868,[384]3.5911,[385]3.5948,[386]3.6008,[387]3.6066,[388]3.6099,[389]3.5991,[390]3.5896,[391]3.5787,[392]3.5730,[393]3.5635,[394]3.5542,[395]3.5450,[396]3.5347,[397]3.5257,[398]3.5160,[399]3.5056,[400]3.4975,[401]3.4874,[402]3.4768,[403]3.4678,[404]3.4573,[405]3.4476,[406]3.4375,[407]3.4281,[408]3.4192,[409]3.4103,[410]3.4041,[411]3.4049,[412]3.4004,[413]3.4022,[414]3.4041,[415]3.4011,[416]3.4013,[417]3.4034,[418]3.3976,[419]3.3989,[420]3.3964,[421]3.3953,[422]3.3969,[423]3.3962,[424]3.4001,[425]3.3995,[426]3.4001,[427]3.3991,[428]3.4014,[429]3.4032,[430]3.4061,[431]3.4070,[432]3.4063,[433]3.4025,[434]3.4028,[435]3.3954,[436]3.3891,[437]3.3850,[438]3.3831,[439]3.3805,[440]3.3854,[441]3.3908,[442]3.3981,[443]3.3963,[444]3.3969,[445]3.3980,[446]3.4029,[447]3.4060,[448]3.4086,[449]3.4116,[450]3.4154,[451]3.4183,[452]3.4205,[453]3.4224,[454]3.4208,[455]3.4229,[456]3.4232,[457]3.4260,[458]3.4311,[459]3.4317,[460]3.4318,[461]3.4283,[462]3.4320,[463]3.4395,[464]3.4450,[465]3.4379,[466]3.4361,[467]3.4345,[468]3.4359,[469]3.4330,[470]3.4301,[471]3.4306,[472]3.4314,[473]3.4306,[474]3.4296,[475]3.4307,[476]3.4293,[477]3.4283,[478]3.4291,[479]3.4308,[480]3.4335,[481]3.4294,[482]3.4328,[483]3.4319,[484]3.4355,[485]3.4418,[486]3.4448,[487]3.4487,[488]3.4541,[489]3.4566,[490]3.4610,[491]3.4673,[492]3.4718,[493]3.4714,[494]3.4726,[495]3.4751,[496]3.4770,[497]3.4799,[498]3.4802,[499]3.4795,[500]3.4835,[501]3.4880,[502]3.4873,[503]3.4858,[504]3.4878,[505]3.4910,[506]3.4996,[507]3.5024,[508]3.5058,[509]3.4982,[510]3.4929,[511]3.4865,[512]3.4821,[513]3.4759,[514]3.4746,[515]3.4771,[516]3.4721,[517]3.4719,[518]3.4709,[519]3.4716,[520]3.4765,[521]3.4751,[522]3.4736,[523]3.4794,[524]3.4783,[525]3.4766,[526]3.4719,[527]3.4668,[528]3.4634,[529]3.4601,[530]3.4570,[531]3.4539,[532]3.4481,[533]3.4416,[534]3.4376,[535]3.4384,[536]3.4414,[537]3.4444,[538]3.4472,[539]3.4499,[540]3.4553,[541]3.4589,[542]3.4613,[543]3.4555,[544]3.4514,[545]3.4510,[546]3.4443,[547]3.4380,[548]3.4314,[549]3.4247,[550]3.4187,[551]3.4125,[552]3.4067,[553]3.4010,[554]3.3990,[555]3.3976,[556]3.4004,[557]3.4045,[558]3.4104,[559]3.4150,[560]3.4202,[561]3.4184,
Final estimate: PPL = 3.4184 +/- 0.01902
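
(For reference: the bracketed values are cumulative perplexity estimates, i.e. exp of the mean negative log-likelihood over all chunks processed so far, and the final estimate is the same quantity over all 561 chunks. Below is a minimal sketch of that relationship, with made-up NLL values rather than anything computed by llama-perplexity.)

```cpp
// Minimal sketch (not the actual llama-perplexity code): how running values
// like "[561]3.4184" and the final estimate relate to per-chunk negative
// log-likelihoods. The NLL values below are made up for illustration.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    // Hypothetical mean NLL per 512-token chunk.
    std::vector<double> chunk_nll = {0.94, 1.20, 0.87, 1.31, 1.25};

    double sum_nll = 0.0;
    for (size_t i = 0; i < chunk_nll.size(); ++i) {
        sum_nll += chunk_nll[i];
        // Running estimate after each chunk: exp(mean NLL so far).
        std::printf("[%zu]%.4f,", i + 1, std::exp(sum_nll / double(i + 1)));
    }
    // Final estimate over all chunks; a NaN in any chunk would propagate
    // here, so a finite final PPL means the whole run stayed NaN-free.
    std::printf("\nFinal estimate: PPL = %.4f\n",
                std::exp(sum_nll / double(chunk_nll.size())));
    return 0;
}
```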