From 543c4b27714c6a677a6e714c452d62532f634cff Mon Sep 17 00:00:00 2001
From: turboderp <11859846+turboderp@users.noreply.github.com>
Date: Sun, 6 Apr 2025 14:42:49 +0200
Subject: [PATCH] Initial commit
---
MANIFEST.in | 1 +
README.md | 112 +
convert.py | 11 +
doc/cat.png | Bin 0 -> 17301 bytes
doc/exl3.md | 77 +
doc/gumbel_eval.png | Bin 0 -> 32066 bytes
doc/humaneval.png | Bin 0 -> 72344 bytes
doc/llama31_70b_instruct_bpw.png | Bin 0 -> 171124 bytes
doc/llama31_70b_instruct_vram.png | Bin 0 -> 167875 bytes
doc/llama31_8b_instruct_bpw.png | Bin 0 -> 185643 bytes
doc/llama31_8b_instruct_vram.png | Bin 0 -> 180720 bytes
doc/llama32_1b_instruct_bpw.png | Bin 0 -> 152122 bytes
doc/llama32_1b_instruct_vram.png | Bin 0 -> 157468 bytes
doc/mistral_7b_instruct_v0.3_bpw.png | Bin 0 -> 142966 bytes
doc/mistral_7b_instruct_v0.3_vram.png | Bin 0 -> 143386 bytes
doc/procedural_codebook.png | Bin 0 -> 419829 bytes
eval/compare_q.py | 264 +
eval/compare_q_exllamav2.py | 34 +
eval/compare_q_exllamav3.py | 38 +
eval/compare_q_llamacpp.py | 61 +
eval/compare_q_transformers.py | 106 +
eval/humaneval.py | 168 +
eval/model_diff.py | 95 +
eval/ppl.py | 92 +
eval/spec/llama3.1-70b-instruct_aqlm.json | 8 +
eval/spec/llama3.1-70b-instruct_awq.json | 8 +
eval/spec/llama3.1-70b-instruct_exl2.json | 44 +
eval/spec/llama3.1-70b-instruct_exl3.json | 76 +
eval/spec/llama3.1-70b-instruct_gguf.json | 62 +
eval/spec/llama3.1-70b-instruct_vptq.json | 14 +
eval/spec/llama3.1-8b-instruct_aqlm.json | 20 +
eval/spec/llama3.1-8b-instruct_exl2.json | 44 +
eval/spec/llama3.1-8b-instruct_exl3.json | 74 +
eval/spec/llama3.1-8b-instruct_gguf.json | 74 +
eval/spec/llama3.1-8b-instruct_hf.json | 8 +
eval/spec/llama3.1-8b-instruct_vptq.json | 26 +
eval/spec/llama3.2-1b-instruct_aqlm.json | 8 +
eval/spec/llama3.2-1b-instruct_awq.json | 8 +
eval/spec/llama3.2-1b-instruct_bnb.json | 8 +
eval/spec/llama3.2-1b-instruct_exl2.json | 50 +
eval/spec/llama3.2-1b-instruct_exl3.json | 62 +
eval/spec/llama3.2-1b-instruct_gguf.json | 53 +
eval/spec/mistral-7b-instruct-v0.3_awq.json | 8 +
eval/spec/mistral-7b-instruct-v0.3_exl2.json | 38 +
eval/spec/mistral-7b-instruct-v0.3_exl3.json | 56 +
eval/spec/mistral-7b-instruct-v0.3_gguf.json | 50 +
eval/spec/wiki2_llama3.json | 8 +
eval/spec/wiki2_llama3_large.json | 8 +
eval/spec/wiki2_mistral_large.json | 8 +
examples/async_generator.py | 89 +
examples/banned_strings.py | 105 +
examples/chat.py | 112 +
examples/chat_templates.py | 209 +
examples/common.py | 67 +
examples/dynamic_gen.py | 248 +
examples/generation_loop.py | 91 +
examples/generator.py | 186 +
exllamav3/__init__.py | 6 +
exllamav3/cache/__init__.py | 2 +
exllamav3/cache/cache.py | 119 +
exllamav3/cache/fp16.py | 54 +
exllamav3/constants.py | 1 +
exllamav3/conversion/__init__.py | 0
exllamav3/conversion/allocation.py | 97 +
exllamav3/conversion/calibration_data.py | 92 +
exllamav3/conversion/compile.py | 137 +
exllamav3/conversion/convert_model.py | 356 +
.../conversion/standard_cal_data/c4.utf8 | 118 +
.../conversion/standard_cal_data/code.utf8 | 8225 +++++++
.../standard_cal_data/multilingual.utf8 | 525 +
.../standard_cal_data/technical.utf8 | 988 +
.../conversion/standard_cal_data/tiny.utf8 | 2240 ++
.../conversion/standard_cal_data/wiki.utf8 | 20217 ++++++++++++++++
exllamav3/device.py | 44 +
exllamav3/exllamav3_ext/activation.cu | 117 +
exllamav3/exllamav3_ext/activation.cuh | 17 +
exllamav3/exllamav3_ext/bindings.cpp | 59 +
exllamav3/exllamav3_ext/compat.cuh | 29 +
exllamav3/exllamav3_ext/generator/gumbel.cu | 162 +
exllamav3/exllamav3_ext/generator/gumbel.cuh | 24 +
.../exllamav3_ext/generator/sampling_basic.cu | 197 +
.../generator/sampling_basic.cuh | 18 +
exllamav3/exllamav3_ext/generator/strings.cpp | 72 +
exllamav3/exllamav3_ext/generator/strings.h | 24 +
exllamav3/exllamav3_ext/hadamard.cpp | 112 +
exllamav3/exllamav3_ext/hadamard.h | 13 +
exllamav3/exllamav3_ext/hgemm.cu | 57 +
exllamav3/exllamav3_ext/hgemm.cuh | 10 +
exllamav3/exllamav3_ext/norm.cu | 236 +
exllamav3/exllamav3_ext/norm.cuh | 12 +
exllamav3/exllamav3_ext/ptx.cuh | 246 +
exllamav3/exllamav3_ext/quant/codebook.cuh | 89 +
exllamav3/exllamav3_ext/quant/exl3_dq.cuh | 255 +
exllamav3/exllamav3_ext/quant/exl3_gemm.cu | 293 +
exllamav3/exllamav3_ext/quant/exl3_gemm.cuh | 14 +
.../exllamav3_ext/quant/exl3_gemm_kernel.cuh | 620 +
exllamav3/exllamav3_ext/quant/hadamard.cu | 210 +
exllamav3/exllamav3_ext/quant/hadamard.cuh | 12 +
exllamav3/exllamav3_ext/quant/pack.cu | 220 +
exllamav3/exllamav3_ext/quant/pack.cuh | 23 +
exllamav3/exllamav3_ext/quant/quantize.cu | 328 +
exllamav3/exllamav3_ext/quant/quantize.cuh | 19 +
exllamav3/exllamav3_ext/quant/reconstruct.cu | 125 +
exllamav3/exllamav3_ext/quant/reconstruct.cuh | 10 +
exllamav3/exllamav3_ext/reduction.cuh | 104 +
exllamav3/exllamav3_ext/rope.cu | 197 +
exllamav3/exllamav3_ext/rope.cuh | 17 +
exllamav3/exllamav3_ext/softcap.cu | 95 +
exllamav3/exllamav3_ext/softcap.cuh | 10 +
exllamav3/exllamav3_ext/stloader.cpp | 178 +
exllamav3/exllamav3_ext/stloader.h | 18 +
exllamav3/exllamav3_ext/util.cuh | 97 +
exllamav3/exllamav3_ext/util.h | 118 +
exllamav3/ext.py | 134 +
exllamav3/generator/__init__.py | 3 +
exllamav3/generator/async_generator.py | 94 +
exllamav3/generator/generator.py | 713 +
exllamav3/generator/job.py | 897 +
exllamav3/generator/pagetable.py | 361 +
exllamav3/generator/sampler/__init__.py | 23 +
exllamav3/generator/sampler/custom.py | 291 +
exllamav3/generator/sampler/presets.py | 80 +
exllamav3/generator/sampler/sampler.py | 20 +
exllamav3/loader/__init__.py | 1 +
exllamav3/loader/safetensors.py | 316 +
exllamav3/model_init.py | 91 +
exllamav3/models/__init__.py | 3 +
exllamav3/models/architectures.py | 37 +
exllamav3/models/config.py | 168 +
exllamav3/models/gemma.py | 167 +
exllamav3/models/llama.py | 142 +
exllamav3/models/mistral.py | 27 +
exllamav3/models/model.py | 361 +
exllamav3/models/phi3.py | 149 +
exllamav3/models/qwen2.py | 28 +
exllamav3/modules/__init__.py | 7 +
exllamav3/modules/attn.py | 370 +
exllamav3/modules/embedding.py | 77 +
exllamav3/modules/linear.py | 213 +
exllamav3/modules/mlp.py | 104 +
exllamav3/modules/module.py | 87 +
exllamav3/modules/quant/__init__.py | 3 +
exllamav3/modules/quant/exl3.py | 132 +
exllamav3/modules/quant/exl3_lib/__init__.py | 1 +
exllamav3/modules/quant/exl3_lib/quantize.py | 582 +
exllamav3/modules/quant/fp16.py | 115 +
exllamav3/modules/rmsnorm.py | 81 +
exllamav3/modules/transformer.py | 81 +
exllamav3/tokenizer/__init__.py | 1 +
exllamav3/tokenizer/tokenizer.py | 611 +
exllamav3/util/__init__.py | 1 +
exllamav3/util/arch_list.py | 39 +
exllamav3/util/file.py | 116 +
exllamav3/util/hadamard.py | 137 +
exllamav3/util/hadamard_data/hadamard_1.txt | 1 +
exllamav3/util/hadamard_data/hadamard_100.txt | 100 +
exllamav3/util/hadamard_data/hadamard_116.txt | 116 +
exllamav3/util/hadamard_data/hadamard_156.txt | 156 +
exllamav3/util/hadamard_data/hadamard_172.txt | 172 +
exllamav3/util/hadamard_data/hadamard_188.txt | 188 +
exllamav3/util/hadamard_data/hadamard_236.txt | 236 +
exllamav3/util/hadamard_data/hadamard_244.txt | 244 +
exllamav3/util/hadamard_data/hadamard_428.txt | 428 +
exllamav3/util/hadamard_data/hadamard_52.txt | 52 +
exllamav3/util/hadamard_data/hadamard_92.txt | 92 +
exllamav3/util/hadamard_data/primes.txt | 10000 ++++++++
exllamav3/util/memory.py | 168 +
exllamav3/util/misc.py | 45 +
exllamav3/util/progress.py | 36 +
exllamav3/util/rope.py | 308 +
exllamav3/util/tensor.py | 147 +
exllamav3/version.py | 1 +
requirements.txt | 7 +
requirements_eval.txt | 12 +
requirements_examples.txt | 1 +
science/codebook_eval.py | 46 +
science/gumbel_eval.py | 68 +
science/qgemm_benchmark.py | 125 +
setup.py | 108 +
tests/generator_stresstest.py | 111 +
tests/test_ext_norm.py | 69 +
tests/test_qgemm.py | 58 +
tests/test_quant_fn.py | 159 +
tests/test_rope.py | 77 +
tests/test_sampler.py | 104 +
tests/util.py | 51 +
186 files changed, 61017 insertions(+)
create mode 100644 MANIFEST.in
create mode 100644 README.md
create mode 100644 convert.py
create mode 100644 doc/cat.png
create mode 100644 doc/exl3.md
create mode 100644 doc/gumbel_eval.png
create mode 100644 doc/humaneval.png
create mode 100644 doc/llama31_70b_instruct_bpw.png
create mode 100644 doc/llama31_70b_instruct_vram.png
create mode 100644 doc/llama31_8b_instruct_bpw.png
create mode 100644 doc/llama31_8b_instruct_vram.png
create mode 100644 doc/llama32_1b_instruct_bpw.png
create mode 100644 doc/llama32_1b_instruct_vram.png
create mode 100644 doc/mistral_7b_instruct_v0.3_bpw.png
create mode 100644 doc/mistral_7b_instruct_v0.3_vram.png
create mode 100644 doc/procedural_codebook.png
create mode 100644 eval/compare_q.py
create mode 100644 eval/compare_q_exllamav2.py
create mode 100644 eval/compare_q_exllamav3.py
create mode 100644 eval/compare_q_llamacpp.py
create mode 100644 eval/compare_q_transformers.py
create mode 100644 eval/humaneval.py
create mode 100644 eval/model_diff.py
create mode 100644 eval/ppl.py
create mode 100644 eval/spec/llama3.1-70b-instruct_aqlm.json
create mode 100644 eval/spec/llama3.1-70b-instruct_awq.json
create mode 100644 eval/spec/llama3.1-70b-instruct_exl2.json
create mode 100644 eval/spec/llama3.1-70b-instruct_exl3.json
create mode 100644 eval/spec/llama3.1-70b-instruct_gguf.json
create mode 100644 eval/spec/llama3.1-70b-instruct_vptq.json
create mode 100644 eval/spec/llama3.1-8b-instruct_aqlm.json
create mode 100644 eval/spec/llama3.1-8b-instruct_exl2.json
create mode 100644 eval/spec/llama3.1-8b-instruct_exl3.json
create mode 100644 eval/spec/llama3.1-8b-instruct_gguf.json
create mode 100644 eval/spec/llama3.1-8b-instruct_hf.json
create mode 100644 eval/spec/llama3.1-8b-instruct_vptq.json
create mode 100644 eval/spec/llama3.2-1b-instruct_aqlm.json
create mode 100644 eval/spec/llama3.2-1b-instruct_awq.json
create mode 100644 eval/spec/llama3.2-1b-instruct_bnb.json
create mode 100644 eval/spec/llama3.2-1b-instruct_exl2.json
create mode 100644 eval/spec/llama3.2-1b-instruct_exl3.json
create mode 100644 eval/spec/llama3.2-1b-instruct_gguf.json
create mode 100644 eval/spec/mistral-7b-instruct-v0.3_awq.json
create mode 100644 eval/spec/mistral-7b-instruct-v0.3_exl2.json
create mode 100644 eval/spec/mistral-7b-instruct-v0.3_exl3.json
create mode 100644 eval/spec/mistral-7b-instruct-v0.3_gguf.json
create mode 100644 eval/spec/wiki2_llama3.json
create mode 100644 eval/spec/wiki2_llama3_large.json
create mode 100644 eval/spec/wiki2_mistral_large.json
create mode 100644 examples/async_generator.py
create mode 100644 examples/banned_strings.py
create mode 100644 examples/chat.py
create mode 100644 examples/chat_templates.py
create mode 100644 examples/common.py
create mode 100644 examples/dynamic_gen.py
create mode 100644 examples/generation_loop.py
create mode 100644 examples/generator.py
create mode 100644 exllamav3/__init__.py
create mode 100644 exllamav3/cache/__init__.py
create mode 100644 exllamav3/cache/cache.py
create mode 100644 exllamav3/cache/fp16.py
create mode 100644 exllamav3/constants.py
create mode 100644 exllamav3/conversion/__init__.py
create mode 100644 exllamav3/conversion/allocation.py
create mode 100644 exllamav3/conversion/calibration_data.py
create mode 100644 exllamav3/conversion/compile.py
create mode 100644 exllamav3/conversion/convert_model.py
create mode 100644 exllamav3/conversion/standard_cal_data/c4.utf8
create mode 100644 exllamav3/conversion/standard_cal_data/code.utf8
create mode 100644 exllamav3/conversion/standard_cal_data/multilingual.utf8
create mode 100644 exllamav3/conversion/standard_cal_data/technical.utf8
create mode 100644 exllamav3/conversion/standard_cal_data/tiny.utf8
create mode 100644 exllamav3/conversion/standard_cal_data/wiki.utf8
create mode 100644 exllamav3/device.py
create mode 100644 exllamav3/exllamav3_ext/activation.cu
create mode 100644 exllamav3/exllamav3_ext/activation.cuh
create mode 100644 exllamav3/exllamav3_ext/bindings.cpp
create mode 100644 exllamav3/exllamav3_ext/compat.cuh
create mode 100644 exllamav3/exllamav3_ext/generator/gumbel.cu
create mode 100644 exllamav3/exllamav3_ext/generator/gumbel.cuh
create mode 100644 exllamav3/exllamav3_ext/generator/sampling_basic.cu
create mode 100644 exllamav3/exllamav3_ext/generator/sampling_basic.cuh
create mode 100644 exllamav3/exllamav3_ext/generator/strings.cpp
create mode 100644 exllamav3/exllamav3_ext/generator/strings.h
create mode 100644 exllamav3/exllamav3_ext/hadamard.cpp
create mode 100644 exllamav3/exllamav3_ext/hadamard.h
create mode 100644 exllamav3/exllamav3_ext/hgemm.cu
create mode 100644 exllamav3/exllamav3_ext/hgemm.cuh
create mode 100644 exllamav3/exllamav3_ext/norm.cu
create mode 100644 exllamav3/exllamav3_ext/norm.cuh
create mode 100644 exllamav3/exllamav3_ext/ptx.cuh
create mode 100644 exllamav3/exllamav3_ext/quant/codebook.cuh
create mode 100644 exllamav3/exllamav3_ext/quant/exl3_dq.cuh
create mode 100644 exllamav3/exllamav3_ext/quant/exl3_gemm.cu
create mode 100644 exllamav3/exllamav3_ext/quant/exl3_gemm.cuh
create mode 100644 exllamav3/exllamav3_ext/quant/exl3_gemm_kernel.cuh
create mode 100644 exllamav3/exllamav3_ext/quant/hadamard.cu
create mode 100644 exllamav3/exllamav3_ext/quant/hadamard.cuh
create mode 100644 exllamav3/exllamav3_ext/quant/pack.cu
create mode 100644 exllamav3/exllamav3_ext/quant/pack.cuh
create mode 100644 exllamav3/exllamav3_ext/quant/quantize.cu
create mode 100644 exllamav3/exllamav3_ext/quant/quantize.cuh
create mode 100644 exllamav3/exllamav3_ext/quant/reconstruct.cu
create mode 100644 exllamav3/exllamav3_ext/quant/reconstruct.cuh
create mode 100644 exllamav3/exllamav3_ext/reduction.cuh
create mode 100644 exllamav3/exllamav3_ext/rope.cu
create mode 100644 exllamav3/exllamav3_ext/rope.cuh
create mode 100644 exllamav3/exllamav3_ext/softcap.cu
create mode 100644 exllamav3/exllamav3_ext/softcap.cuh
create mode 100644 exllamav3/exllamav3_ext/stloader.cpp
create mode 100644 exllamav3/exllamav3_ext/stloader.h
create mode 100644 exllamav3/exllamav3_ext/util.cuh
create mode 100644 exllamav3/exllamav3_ext/util.h
create mode 100644 exllamav3/ext.py
create mode 100644 exllamav3/generator/__init__.py
create mode 100644 exllamav3/generator/async_generator.py
create mode 100644 exllamav3/generator/generator.py
create mode 100644 exllamav3/generator/job.py
create mode 100644 exllamav3/generator/pagetable.py
create mode 100644 exllamav3/generator/sampler/__init__.py
create mode 100644 exllamav3/generator/sampler/custom.py
create mode 100644 exllamav3/generator/sampler/presets.py
create mode 100644 exllamav3/generator/sampler/sampler.py
create mode 100644 exllamav3/loader/__init__.py
create mode 100644 exllamav3/loader/safetensors.py
create mode 100755 exllamav3/model_init.py
create mode 100644 exllamav3/models/__init__.py
create mode 100644 exllamav3/models/architectures.py
create mode 100644 exllamav3/models/config.py
create mode 100644 exllamav3/models/gemma.py
create mode 100644 exllamav3/models/llama.py
create mode 100644 exllamav3/models/mistral.py
create mode 100644 exllamav3/models/model.py
create mode 100644 exllamav3/models/phi3.py
create mode 100644 exllamav3/models/qwen2.py
create mode 100644 exllamav3/modules/__init__.py
create mode 100644 exllamav3/modules/attn.py
create mode 100644 exllamav3/modules/embedding.py
create mode 100644 exllamav3/modules/linear.py
create mode 100644 exllamav3/modules/mlp.py
create mode 100644 exllamav3/modules/module.py
create mode 100644 exllamav3/modules/quant/__init__.py
create mode 100644 exllamav3/modules/quant/exl3.py
create mode 100644 exllamav3/modules/quant/exl3_lib/__init__.py
create mode 100644 exllamav3/modules/quant/exl3_lib/quantize.py
create mode 100644 exllamav3/modules/quant/fp16.py
create mode 100644 exllamav3/modules/rmsnorm.py
create mode 100644 exllamav3/modules/transformer.py
create mode 100644 exllamav3/tokenizer/__init__.py
create mode 100644 exllamav3/tokenizer/tokenizer.py
create mode 100644 exllamav3/util/__init__.py
create mode 100644 exllamav3/util/arch_list.py
create mode 100644 exllamav3/util/file.py
create mode 100644 exllamav3/util/hadamard.py
create mode 100644 exllamav3/util/hadamard_data/hadamard_1.txt
create mode 100644 exllamav3/util/hadamard_data/hadamard_100.txt
create mode 100644 exllamav3/util/hadamard_data/hadamard_116.txt
create mode 100644 exllamav3/util/hadamard_data/hadamard_156.txt
create mode 100644 exllamav3/util/hadamard_data/hadamard_172.txt
create mode 100644 exllamav3/util/hadamard_data/hadamard_188.txt
create mode 100644 exllamav3/util/hadamard_data/hadamard_236.txt
create mode 100644 exllamav3/util/hadamard_data/hadamard_244.txt
create mode 100644 exllamav3/util/hadamard_data/hadamard_428.txt
create mode 100644 exllamav3/util/hadamard_data/hadamard_52.txt
create mode 100644 exllamav3/util/hadamard_data/hadamard_92.txt
create mode 100644 exllamav3/util/hadamard_data/primes.txt
create mode 100644 exllamav3/util/memory.py
create mode 100644 exllamav3/util/misc.py
create mode 100644 exllamav3/util/progress.py
create mode 100644 exllamav3/util/rope.py
create mode 100644 exllamav3/util/tensor.py
create mode 100644 exllamav3/version.py
create mode 100644 requirements.txt
create mode 100644 requirements_eval.txt
create mode 100644 requirements_examples.txt
create mode 100644 science/codebook_eval.py
create mode 100644 science/gumbel_eval.py
create mode 100644 science/qgemm_benchmark.py
create mode 100644 setup.py
create mode 100644 tests/generator_stresstest.py
create mode 100644 tests/test_ext_norm.py
create mode 100644 tests/test_qgemm.py
create mode 100644 tests/test_quant_fn.py
create mode 100644 tests/test_rope.py
create mode 100644 tests/test_sampler.py
create mode 100644 tests/util.py
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..9ec626d
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+include exllamav3/util/hadamard_data/*
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e45ac58
--- /dev/null
+++ b/README.md
@@ -0,0 +1,112 @@
+
+# ExLlamaV3
+
+This is an **early preview release** of ExLlamaV3. Please note: ↙
+
+- The framework is not yet fully optimized. Performance is lacking, especially on Ampere, and there may be a significant CPU bottleneck on slower processors until the extension functions are fully built out.
+- AMD GPUs (ROCm) are not yet supported.
+- [FlashAttention-2](https://github.com/Dao-AILab/flash-attention) is currently required. I hope to switch over to [FlashInfer](https://github.com/flashinfer-ai/flashinfer/tree/main) in time, but there are some obstacles to overcome first.
+- A number of important features are yet to be added, such as cache quantization, tensor parallelism and multimodal support.
+- There are no release builds yet.
+- Integration into [TabbyAPI](https://github.com/theroyallab/tabbyAPI/) is planned when all the core functionality is in place.
+
+### Why?
+
+As the name implies, the original intention for ExLlama was to run inference on quantized Llama models. ExLlamaV2 was able to support a number of other architectures by treating every new model as (more or less) a Llama variant with optional features. However, as new models are increasingly moving away from the basic transformer template, this approach is no longer sustainable.
+
+Additionally, ExLlamaV2 is largely designed to run in a single process and CUDA doesn't like this very much when spreading a workload across multiple GPUs. It's a fundamental design feature in the CUDA runtime, and it has become a major obstacle to tensor-parallel inference, demand for which seems to keep increasing. This shortcoming is not easily addressed without a rewrite. Moreover, the **EXL2** format doesn't lend itself well to parallel inference in the first place due to its input channel permutation.
+
+Aside from lifting a few of the most successful features from V2 (such as the generator), ExLlamaV3 is largely rewritten from scratch to provide a cleaner, more modular framework for supporting newer architectures. It also introduces a new SOTA quantization format based on [**QTIP**](https://github.com/Cornell-RelaxML/qtip) (see below).
+
+### What's missing?
+
+There's much that still needs to be added and/or ported over from ExLlamaV2. I've decided to release ExLlamaV3 in its current state to invite testing, feedback and contributions, but please be aware that it's not yet a viable replacement for ExLlamaV2. Currently on the to-do list:
+
+- Support for more architectures (Mixtral, Cohere and Deepseek are in the works)
+- Samplers (most notably repetition penalties and min-P are missing)
+- Constrained sampling (JSON filters etc.)
+- Multimodal support
+- Cache quantization
+- LoRA support
+- ROCm support
+- Tensor-parallel inference
+- Lots of optimization
+
+As for what is implemented, expect that some things may be a little broken at first. Please be patient and/or contribute. 👉👈
+
+### How to?
+
+#### Installation
+
+Detailed installation instructions are coming soon, along with prebuilt wheels. For the time being, you can install the library with:
+
+```sh
+# Full installation
+pip install -r requirements.txt
+pip install .
+
+# JIT mode
+EXLLAMA_NOCOMPILE=1 pip install .
+```
+
+Note that the included scripts can run in JIT mode from the repo directory without installing the library.
+
+#### Conversion
+
+To convert a model to EXL3 format, use:
+
+```sh
+# Convert model
+python convert.py -i <input_dir> -o <output_dir> -w <working_dir> -b <bitrate>
+
+# Resume an interrupted quant job
+convert.py -w <working_dir> -r
+
+# More options
+convert.py -h
+```
+
+The working directory is temporary storage for state checkpoints and for storing quantized tensors until the converted model can be compiled. It should have enough free space to store an entire copy of the output model. Note that while EXL2 conversion by default resumes an interrupted job when pointed to an existing folder, EXL3 needs you to explicitly resume with the `-r`/`--resume` argument.
+
+#### Examples
+
+A number of example scripts are provided to showcase the features of the backend and generator. Some of them have hardcoded model paths and should be edited before you run them, but there is a simple CLI chatbot that you can start with:
+
+```sh
+python examples/chat.py -m <model_dir> -mode <prompt_mode>
+
+# E.g.:
+python examples/chat.py -m /mnt/models/llama3.1-8b-instruct-exl3 -mode llama3
+```
+
+### EXL3 quantization
+
+
+
+
+
+
+
+Despite their amazing achievements, most SOTA quantization techniques remain cumbersome or even prohibitively expensive to use. For instance, **AQLM** quantization of a 70B model takes around **720 GPU-hours** on an A100 server, costing $850 US at the time of writing. ExLlamaV3 aims to address this with **EXL3** format, which is a streamlined variant of [**QTIP**](https://github.com/Cornell-RelaxML/qtip) from Cornell RelaxML. The conversion process is designed to be simple and efficient and requires only an input model (in HF format) and a target bitrate. By computing Hessians on the fly and thanks to a fused Viterbi kernel, the quantizer can convert a model in a single step, taking a couple of minutes for smaller models, up to a few hours for larger ones (70B+) (on a single RTX 4090 or equivalent GPU.)
+
+The [Marlin](https://github.com/IST-DASLab/marlin)-inspired GEMM kernel achieves roughly memory-bound latency under optimal conditions (4bpw, RTX 4090), though it still needs some work to achieve the same efficiency on Ampere GPUs and to remain memory-bound at lower bitrates.
+
+Since converted models largely retain the original file structure (unlike **EXL2** which renames some tensors in its quest to turn every model into a Llama variant), it will be possible to extend **EXL3** support to other frameworks like HF Transformers and vLLM.
+
+There are some benchmark results [here](doc/exl3.md), and a full writeup on the format is coming soon.
+
+Fun fact: Llama-3.1-70B-EXL3 is coherent at 1.6 bpw. With the output layer quantized to 3 bpw and a 4096-token cache, inference is possible in under 16 GB of VRAM.
+
+A selection of EXL3-quantized models is available on [🤗 Hugging Face](https://huggingface.co/turboderp).
+
+
+### Acknowledgements
+
+This project owes its existence to a wonderful community of FOSS developers and some very generous supporters (🐈❤️!) The following projects in particular deserve a special mention:
+
+- [TabbyAPI](https://github.com/theroyallab/tabbyAPI/)
+- [PyTorch](https://github.com/pytorch/pytorch)
+- [FlashAttention](https://github.com/Dao-AILab/flash-attention)
+- [QTIP](https://github.com/Cornell-RelaxML/qtip)
+- [Transformers](https://github.com/huggingface/transformers)
+- [Marlin](https://github.com/IST-DASLab/marlin)
\ No newline at end of file
diff --git a/convert.py b/convert.py
new file mode 100644
index 0000000..5381f4a
--- /dev/null
+++ b/convert.py
@@ -0,0 +1,14 @@
+from exllamav3.conversion.convert_model import parser, main, prepare
+
+# Thin CLI wrapper: the actual implementation is included in the package at
+# ./exllamav3/conversion/convert_model.py
+
+if __name__ == "__main__":
+    _args = parser.parse_args()
+    # prepare() returns an (args, job_state, ok, err) tuple; presumably it
+    # validates the parsed arguments and sets up the conversion job, reporting
+    # failure via (ok, err) rather than raising -- TODO confirm in convert_model.py
+    _in_args, _job_state, _ok, _err = prepare(_args)
+    if not _ok:
+        print(f" !! Error: {_err}")
+    else:
+        main(_in_args, _job_state)
diff --git a/doc/cat.png b/doc/cat.png
new file mode 100644
index 0000000000000000000000000000000000000000..8a8752b930a3796f00085a847d86c55f87e51547
GIT binary patch
literal 17301
zcmV+2Kq9}1P)EX>4Tx04R}tkvT{MK@^5R;}H?X!W1GFTd0+3RKzkyK?H>mBgCU{cViC7Zo+PY
zjiq2KSa>vEg^jhg7Pf*Qc!1aoR*E(v>zgEyAjW}VKK_~i@!mh+JItDv+Y|!JE;!Mt
zuohof)ch|r(Si>M>ZY3?k3?puyFTw|Q0Hw>)vEvQ&()Dwbxml{ghOWDDG09!kFFK+
zig$$rW?D}Op9v2-(xC8z(v32|Do&MHwM@W?MJI#XD;*YWq1{1I^(;>uu(
zW0D*eddZLagWuV@$>E`LrS>p1KYQMLKlr!dd&~2_jy&)C7#g48(ysb*hcNab`c_rz
zJAlp&IKQfD#tvL;Lib}{cl46fDBG7z!rQ5Q(*kteK-*F|x5_;y_Yjk;X0Bmx9|lrV
zd%4cst+l!R`=*lL4=z)3i-eW@?f?J)24YJ`L;(K){{a7>y{D4^000SaNLh0L04^f{
z04^f|c%?sf00007bV*G`2k8U`3IixUduJH{000?uMObu0Z*6U5Zgc=ca%Ew3Wn>_C
zX>@2HM@dakSAh-}001BWNkl?=Nx-F=TO~M
z=j{FdIQ#6gPtB05K@+z>W;8wBRn_OLv%+uv)^7=Z#@X=Y2NnUZ2W|mY0<(b%dR@0r67|WgT!Y@UAP@^|Ctm1NYr_&s`gTE`!m6md&O9;oS$+k
zjg24-&a(f?HMw$rj(-1BzECdL^W9pO}OH)9jyv@|qPSyoO>
zRW+sAEajzTS8mgVb8bly!u>D%@qM4?wr}R)XE$)T>nH`kfRKO|D5YqrYo@iKnYpcP
ztX*|Ab6e-&dES*N{G6E-=b@3f_sBuM{N1mE6iH)2ppBJ8#-g=GYuLJf
z8(a5pr=zKzSKn|eH(h-l<)vi^AuhVBuC+!fMcyyaJJ3&8Uk`&L!yG?-65sa$Xq;8g
zviXZx)H$EBZ0UvadzOd|Uw%NptR|Yz^XV`A9ea-NLu(CMvt;2y0;TBc?qzgrgjC8y
z2x+ZGqvH!^46Lo8jW@2plWUf*x@hv8sa%fJeLWmMeTp3i_p*84HVS?or4$H(K$zdw
zcvhgLzJ=eo^G#g0Vl}A?ASA!+26O1dG5+ccf6K(oBsG;)-2Td&$)q!YMoP)#WR86Y
zj&StEadOksp-~xQ5kg=eG_6U=G)p@dbN8>^MO$+#Qp%tDR0m3-l)?{u3WWkQ`8)$7
zLy7P6q(@VI0|LmTGkBhdloC(M|I$lw
zG|^Ju$gG-LJSl&sC*6PaFkAQT&M8=5$_9~hh>)E_x7rE(N
z_{2$*=V*;q*r)#f@BcfSc5maifBju#OG?;#_#h8Iw}GzyZgMlz6#Tr&nH+~I1v&`O
z=1EpcMa~N$G{^f+a=h=9z0oO|qO_!x`s!J{`i5IsI)4#$)wPslGS<803we&7I?2EN
z@IN?s@(|Pc84%&?2TB>k*Ty(PGtpX+^3v2+*HBSWj+9||n#t$s?(IdX@H0O2_`}@$
z_IHy`r7jPH2~@zA{o6o?9;LLj6zO>s3I$XU;H6T?Fat&kiIkGL9doFyt!C4f9d!5g
zhBs8%lo(BvFiZ9x+s~)I@VCqq@{H%k?ajw{60KF383+*OrYWMz%)E{a!=5ld>*h~|
zpnyMBV0>zvFFbfZ0BaVlV)5JstXQ}NPf8wr?nxfq^n^|1!w(@qIEJc0o4L}Z-Q{606O)t76!IvOPh`?5
z0^biyXsyvoAtXoVN-KmEbau3Hs=Jrm^pr^sW5cnY#CQg6Zn_i_Wx|ox3L(ODCQ!x*gbVyY
zpmkW*o3Ih(Hd0FC1*P>MI&w7#`(7GtKL
zKnfEUwV+TaB)z2+bocjOJ_a+Dn~JLf0e%qJT*dRG4MSRKTh12pd8Fr|lw#}leH=P^
z)LzIabcFXz#QBB>g#s={6yf&l5sFvn0~Xc`_FWz&`&3=9sMTq8xH>(_&So%8zCT@&7y*Ulu7WV=Y?S}h(1$jB!N%}Kxsuf
zl_t;{>3K{~=i`ugo&n#N6$L6UD>s;wB$e{;1BI7L0jA{iBworxin!?P=^Nm&C!S$&
zcqnm$qni~OAdaC3CmdOiU_3~OdnJ=)VeW)*079IyH~DX}D9w&UnZAd277Vld;z`9#49bl1vG{A0Q9}L73y9Lkvpm%gA6d>2%`hE))tVrSOEb
z{S~whlU)IajvV9hr=B5S$lFp{2pnSw%Mmvzb;E-&p(ctiQb;bUyvyv0;0VF-x8vB--=5PDS2L{(66Nh@sd
z(?s9s-17Lwx;|gTRgvqHVsOvAmPvS|l+Mp28>dlX6y++9J@FJrj-QB7q%b!~rNR&f
zXsytM|2;iDjUV_XhvM=wn3~Eed$XmGuZj#AWqzutnjYc>(A!-{qBpQ=I>N9|}Fb9ggN0cCkq0l__I>cI}
z(@scq%r&<6B#Ih>(dSJh-sT|@wwI*X_~K^cX=3}ABFce6&zjHY$@_lzdJ-zjD=r^{
z5mGY0WuCpLV`Jk8)5k$;#wVwE_Ju8^Qz_#qgbg(=YKtLQ#gRnDMiHU8k=M|%VMKsl
zm}>}I87ZaDNPdNkJeo?B&cXrbLeFQsiSZ7?NszS;KZ7(mnoYj7vboj9Ou+
zAP950AVA6xW*8bC2|cFLcp^t%$#IG3nvQ^F`^uWF3%vaQBoPbo)98TKBKO7WEd2eNXg{bZDK~%
zQk1J{t?g?f1VIq+gGV1@Vsa{MDmvj#3c=|3r0sdtR8)~I$zDbtMhM!PTkUl&_&&ZL
zQ1E^B9XL!^&*{Vysv@i#*BT&W*$OL8*G5Dez6ZUXScnM=$@8>D)B?4;>0!KiWD2DJ9B%Vr_yOxpSCghKVG~(R!G`_FcxOqF1xtx6>h09wpPih!`oQ979%G~ND
zEJq#WSj1i`NW7)SJP5#Hg3m>58by2=h3j0^Tf7)xAnMP9}x$tHJ+AOUo2P@1FxNhxhZG)j0$
z7`EFQA%RlwJ#dIZp+Lbe&^It>9+t4BvV6hf%MZA#EH9@#TTX<&_YVx%1|g=kpU6va
zcq-_k9uNi)lX0mW7esOEATmbjKt6
zZ&9MIv_hM|+qMHRG&0QLV<+$nKI0P;iE!4^(0JYrCUrhMOhauQ$9j(^p86=)2&0j(
z6^x3WMp1;`8lw>!gOJ8$wtXsv2q_b$XJb8rLtW8Du)j+5Bt^)KOEmqo^)$jJk`{6GGB?Q5;03I4m0kJcWD#0
z#*SzWrn>VynVbkPlm+VyZN3xpbI=qe>cVM*hu=4=YSww>@t!w>d11#^PWGR&H#V9z
zirlp@c}Ubx5r_og$~NVq`JD*a)oykXV>3$WMAGTD5#7iZ+L6Nzc8aXX(?uc7zMd}f
zABM#4HW_VyDsq6vb4v4GJjQ*t{VO*_drv8UDO%ge*`4-Imp>!_JY3T)3MS=Pqh1nm-_
z3@hK^mx!V=H%ugjy+JP45_u16oDQy|tzk*i&=jUp5;*)Rr1Y$tD9$CKYEL*7lWk)-
z#u|rSo#63Azi*t6#l1HK^bGXU-QRb4d6*ps_A)s$6;}!tfWtmv+5y;R@;NHYt9a{M
z-$6(FY)aB80u8(O?C0y>xF0_#kj<7AN02iT%~%~Gu0({;wlWd86bWHVUlp|m9FGut
zE0Oqqmzx9N$s|~J7RO_UVx{AmtT%EJY5@#S4zuUb0os~d&wJwgvJB%-=V!S4WA`S>
zNFtW{aM4=_g%3aKiVKW)d+R{V-n836dc%C*DV)e&^KB=CA+YV`NK8F8z>nyz8X7kTP!C>A2L6FP3J}!lm5z`Oooz_q?6D>MWiY
zA+L~1r%9(g8fKO8fp@=^`#%3!)?Igl(aEwKWGpZQdP!LHjOH}sTp*Le=
zx!fcJLt~7LjWaPhPTtH;2Z3@znNDChZu`KU#Kg&g+$=nLpj;#B8tCH4spFSNNZP)C
zk3}+NQsyNlNujxFv!A4`rOsx%T5HB9X4t*|2>T8mMJdIC&Nh}WUBJ?XbNQ=J
ze2hQ*@c+%W9a{{MN+)_xQ7)n_CBfnEl57*8VlGVtDx$#;3beP)JuUAwm-WZIIwT>3-wds@#Wq(Ouol-G$@HjJq-6BbhP;;!v1
zo4@GNHkf>&z`*d31=?`>CW#6|Y5uPd{{eHFWB6N0kLNe<;=X_Q7k=>Y4=FE8p)};C
zeb!xjJ@5IgckqVSy_!$_*++QYTi#42m2t8I?2M0FohWpov+U*}mR><}0^u!hx{Kd=
z-`%vg)|1Jk(JF{3pI0y8b$8xM|KJG!@%_j7)L(y+(rj61n5H5Vkr7ATaEi)zoxCQv
z{VNmlwAP&L>0)X+N7;F#IC#JHrni3lm-z;hxha11>;?u#2a{As=koZ4Jb(B<{*YgL
z?JEtq7GB^jI}h-CzyBc)9X>#5HiMVS;CU&sB^iz$Im*UOo9LK3pLJI+r?h+)&pr2?
ztw4l>@3@(!080Ybmg*@IXmJ#RvZdwx;eY?{{QkY~rg2sUN(+XD#~BLH
zUB8aH+A0U*kZgE*6F++N#}4on{_No3B!B(s&+z6qznRZ{?sN3_cJY<3Kft_s9n5ZN
zvdLwHsA`8k5GAsS(mOfeXaKx`x4z}={MI|(LMH98ar18Oe(&${m9O5<<}EMKH`LFm
zu2Vd>@hSfIKm7x5eA_+jI&j>+#3SMOkTif|=*Ky5u0gQ>rI=RSsHgzJFwV
z^wO@9x(E79IUF(8RNSpKp=VA@BNb((mVxgp&G)|l1L|rk;!ZR%dv2h=AK&*G9UUW=
z*X-G|m&u87Zn)_d{6hW=*;X9PLpk}2w!NNMy;EsT+ni3`d-vPOW+dBpAL5Tb{HOHy
z^~AGCW}2WhTh7WA*HBhk#)m%oaZdFPoA-v>ZeGh(t5${hjV|uC>v;Ud{U68kx+5)N
z$sS(8-hqKj+hB6jxtROmkgcN=1U_wZTCFG12y#=?jEs#TOzc)p!P(eY%{$-qHa_u*
zkMr(#-$QLp1%rb_OiWC%e92;F^7&%u&VlHQ$xhgAvHgrnanlXg)7jp_)O4P&eC?YI
z4)!{^v9^fqAN}!1`0jVV&sV?t4GtdM$KT)g&nQ9#hHOcS54`Wa3=T}hxR2YdHe~Mz
z9JF57dHtx&j~c52=o`Mo4dz(asW22+N`bI5MPYlur>?Hv#`1WYz;{^x@kLfTRKfg*
zfBZZ4@85-xlKf1LLZLuuX%;_-=>c|hB+4NY2(wObYK`MyWNeZ$y{WpxySqIP}|D05*Vc7K}w{@w()C6i8*PNht_t!!7@?JR2t+o-CU
z6-sJ2V<|+PWz5X8cCl8>{EtX_7y&^67A>5IQi@&s4l|V-r>wNXVmT@j6cX^|FMWw1
z2snKB2)lQ0rM#kwLkABtzr6{BCM6X&-Fyp=Jo1CM4dFok@vvTmISEGqfCMgMv7O67
zA0~pxRG@4_Edn}IUWyZ^dSV&?P*G7vM|&I3Z`z2AM0Q*pRK^!jDNX&XT4v3vW$U(`
zl$EB5Wy2g6oJfisio{%Y-1}*2X`r#87NLEPpXwo`({cj%FD~GXOTkMs;O9CCP5CDk+zIT5V9_Wj`hpat#(zeby7wH;K~B9;CoKe&a3%2PZ9L9--&a2uH+Vo0u5qjyrBg2*HL8kCRSkoyy3$A*Q2k(O;D;
zt)nJrDxI-OY9vi&ClpYL`D~45CO=~+pW5+~+A#uU2kGn-LZWXJKPbW`z9dv6OqzT~
z&*`Ev7(WO&-Pgx(jU&K
zic$fS<0H%%wS;AhI+@Dlt+B^Mb(u(>qx+jqWvH7~V+~5m6n*^z&a^_@4NvsFT&jlY
zjN|mg!i3zg`pz@2_rDa9wAMVkbrb*mA75i?W-@NlA*d{?q%@tSzP66a@(Kn@fdd+&Z5yLRnn1Q4DO*BpBkfYw4c4l_woFm7dhN@B$3bxDII=n=*m6m@zD4FgU6nHF5z
}jZ+iCqG`rl
zZi2VH^{vcqZM0R-rmefkPpdP4I#)d_B!U|OMVAWUlxE6EiU?p%#BJp$tc-q=J_+NkX8I~+)!&T=B0!1$G
zbGm1Mf?uGyp@GVZ5ptk-k9*{@oEJD>a1$0*OH>FOWjUGM%KPIaBIiL}znNtDB+6Gftf4$;~C_sHll|MtK)>F8WU
zQ)2^VrCCFd6qIB<>S`)!tgoS>EX&CF6yJOJN#1|&z0Ax^JI%%L1#7FTS-NNee!-7v
z4?reUf&gYWH?ily!C2(b=1I`o+smp|Yv}A~rK+NYn{T|C{RfY+@4!B?*^)%G7#$tw
zwXgYAKKuv2%k1VlOXGa{g>BsbwXYk^7m60!ocZvD9wbSIkLjl2_f?ly@%mfuIIp5Y
zzw~ak=XrGZ_ptro?!@>;eC^`WH{zl^@#K>{^Xz6Oasf59b!5xR&`MDVG{a+4{P2;d
z`OCllpM3MdZ=1D#F)kBTWVxAXYO8Cgs;ab_EEY5b4jw*6?_j?XGfHw$@_vEmHg015
zE7wz1S;nlI3hualJuS_%$>nCKsjj7=zJa+N9sKTZzn|Z~_dPV%SKB+=H#EtIKKgO`
z2YQpgo1;!{t2pg&f{Hwl*5G;4e(%Dz`Mh%NO=extMHZ5Nk#pqaaX$R{zpx$KSd-r=
zZ6l(P(?gk@9Ajz<)?T}cg^QNb*W1k_k3C5`3sn^rXHkYCQH`ZjDORsoL1)KotAjT-
zKFQ&u$Jul601|1b5HU6sw+Yr>w~oL3^N%pMtNZ~ouiFnvQ%J(AG@E`~bxhxhE5>`VmM#a}(`wD*NZFis7ok_n`
zgV9>!2jPtHyZ`)uSeQOeP8}jON%oy(r|Ba8twn$1#5HpHQHNT35uzME%~h3^ELpUG
zo}NCs`+CXyz7_OQ300{xdtI|zX7i~}ew;Nc7EqSWBtmj1^i_lKK5}2
z276=5M1(Lp)nODil?vrmS1w&f+w3Ot1)rIjJQI_X3=fZzo6eziNJ1MP9pOXo`aRaJ
zT61Y3y^~W@42}+Ss^>HV!@~@Y4AC>tcP3p?#oI+(0}{^Ud;E9zXNlDqh$S@yA<;+$
z%9PM?RWdq0&U2eyw25l0;}Q?vTO_w=14>tS7x%pPy}ad3cX9o-SFvK*Qkt9UNvBif
zayj}3N7=q}7td|n%!B{=Ez;=>p7c!bC*}j&4tb!#6X+mhAgVx-N_k|nS<144pdz5D
zp$<9)fgd2HU}S8ZhaZ1}p22>)`+KRYnMLM2>BPKb4-*6dLt`TxIC_Mwdw25u?iZOc
z6a#G}Ej&+J{rx1@4m-3a+|u(5%*Gucjfq#~9~)$4g-^LCCLKW68qon?^;p%Hp5
z001BWNklF+S*Jym7+9TLP<#po+sJ3X&Ze5eU>xRT-VHP*WAcWSFfeH
zzTsup4v;wVYtZn2I
z_x{PtDg*M8Lefk=&z{5k=^pH6!R&d=Z|kJ3sg-$c?bOxGqPD6!A%OgAue^f|o1elD
zg0o*(tu2$@<`f1NFBSqqgj}2cfgzfk8d$z`5p&z;FfuyM)YKHIREnDFN*d~B@$lnM
zGc-CvbyXFe9c^siwI^0=&~Ygq1d6uSR>mhL8Jifx6@S*zXLIyjbV79@+_5A@(z9vp
zE&z?^m7=g5uMZ48UwFNQ-JLWuGsB8`%lL!4e;Ypt=pF3m=&6$&KYfyXq42UVe}Bm#
zX?S#m$>|)mRn=rt>7S~>rnTnbXP@8;-~LKMMJno@BqUZuJaCUFR!@jMQfW5JHCL^q
zrMZ#753CxvG@;_ap(7kRcAR`ZPdb(6SJuCRiODJU>^sQF=%`cj3KlPzM`vd{M~|Ik
z$KE}5-Had^mSR3~Ji8RJ;%s<>L=VZ11nXFZLV6M}3UQ7+TSDa=_}F{@kkw0;NzuIlx0is(kaHqCwS_)O`%?J
zl$cvb7?;8>enetwHd^SVi>SOw&AzY#YhlHbMGOp$aNy8kM#sm(8E)cb6?d#%&pmIv
zi*)MziXpzNd>HpsmRIoZH@%IM|NE5PuUj%G(tBm`*NgCdfL4@>4y9xN_Z`v-?u
zUS7)$ot^ZIk1>%Bc;(e=Sz1?5SyLnb&qM#o(9}5HeLXz3VT`#Q?X)yCP*RdXY0b#!
zI9=Vn9PjF4a&ijkg>q#^3nEmr@?^MC&WnVEB23AIEn$pB>BPD}w>q>W
zQ&m+#`{YQ%QHx=
zDAf|FG+rvCPDq0W$Aj`Rk#L6&C01ULLY}e2D}weDIx>zIif|?=J*q1!nLoFKrp9`z
ztE!mJPvc3+=B>LpcIqU5{H_mj{i>_Ekj^i?HSm-AP|fv?bhdV|WB(pohDU0c?!pII
zG+6I|Hm33jZ`udf-pu;WPJ#dezkrsKAn+km!lyR;n0sIG3YOQ@a>JZCynD@c+}u8o
zw$e&!$|~rannas~(_%X^seBcuT+R%)6C$N_xh*jb!v!1@60kZXUv>BNaQtK!!^5LA
z)ir`oqzn-^
zMhH?ua__pEXeh5h5Hj5G!#@v9O|iA>1n<1+s_=CJpRDK6URBMyjy9II%;pFC_c0w5
z93^oQD&V%e>`HPFD|u?|4%$1KHSUbGEyo5|8JtM0rmIKLnaMhHP`V-pMK%;R|XX~rhUEPkRLX)hA{uLOwXFn?QR4R2Yt
zf~u4k-e4&aL4YjH@}-R%xpDR!7S-07L|H-*gf~!Wb`1^j;O=el5Pp=jy81ALoSlfN
zCyfA6u?}8T5lM#%ibIN$?!HT7U=dXULdDN{EpvI#8-J6|mJUY7#yEWXD9`VFk(1p$
zEa_Z8S?T$ej!#_(Zf^C`6|^)q^H2Z&6?Pon?c6+)I7n22J3FV9Rv{vq3MZI^0k
z10QsGGj
zyN^G@q29x^G&fS3&Ek0;qhsUjKXinlkr8~qVBLJ*#3++~NR-ln51BLsirH1weD-y(
zC+m5j17lPIj9B!ed-hP5DRod+oha|^G-i~nN~c%k=;kRq8?DZ=`dHZANLI_vHcvGv
zFU1>geJ!tl^pb}jb>5>?Ck4fL1h)BQWj84DMf2V87c@6Dlj@=VIy=%
z3LZYNFBS(VT7Ch-cD>zUe&Lk*A)i^t1FlY=sF*jLAbA@Hxwvf+Z(M&TYnH7b<)wb+
z!}N{w&*f?_qLs65nuJQmGUzO^qy^H870p{$rt
z;nX@FV>P{}&~OZxr<5IKi3a2B(ofU$j3ybRkknVz@#-6H^Qgw-w#81d1*P+ZXEHQ5Hn3*p3byXtL4R(XFK&8)53aj`OrQ*$FA-j7ICv5fC{)VhyN3?&r7h2q
zvSTOmEK--kAt}TQGi>)<3Al{GxuZP#=*&*hFAZ{?~b%W0}>IA_-^H$BZ0
zFFwb&9({=O0JuAoi4q#8^x88x4(&a{a2S&zf9cbROnpcNbs#(KoG(m%Fg9
zqP&7d^XIbf;1PE0-V*|7rgF(mPqTdKBIeEQVE@6xsQe6vhWq*a3*kgST6zRZ;hP0s
zCDJpcumJ!AcdZS{T*jGmRz_>iZi}PX)2PIZQ}J#mt>K^l^Q#Pxjqrw7-FZ%g4L9xF
z&Lb~8ZiWM6Aw_ijph?r*1(?p~7#$y@q4vMNw?JM9#^MJ)kH7G2T%lOW)OeTCu6+mD
zzH4`4QI-&boqP8&m76A$Nz>ZWl6cCMhH1YL8VgB&N>Zvd27DYs{3V)_m*nhxn&||5rvQ#$VDk8yFtqi{JTI3jw%WfZRQpHZGgW@1;&0
zTo48`GCs!UJ=?$dPj_Wai>BCU90
z({oJCOcCn>>ZG2G?SsWr7xgtSeT~iqVK4`d9-(`<$0$V^j3cqvTv~KgyYgZhJcnZ<
z9YU3>V<#wT9x7LL!R(t%%zr0CR0#jaVn9xdDWRn&9F?tz6y~ahx{_qwL%sanSN{*)
z{k^|(D;uvuS5ffz#*|@!}3CBzNBYs+T-9dEO01Yt7SJpAS6(XW43R`bY7F
z7Y<85oKmcn8#IeFnM4GbYw3SB_
zmVOa;vg6*;q0>kB)IWWm!zX_+N%+B$VRjwe6YrEF<_yOJ3Nmg&Xw9`t)^O9B>s~sr
zdcF;2YC6a6Bl|dG;hnqq%-oR3FGLH_wbC@!HBw%lC7n)l;?ya7lM+isaaYr5r_+t2
z0UUB{i)W%la><*`p`tyH!mb~bw!w%v%Zfo)8pL6Db`N#)`TM`{3wfH8-KS#|*=~U$
z7AA5eL;y`jFyd2b(UR7=%$sG%clkq5KYC>a)Dk>?Z`E1zyA}ZUhN|lOLssOr&
zyZQY6|L5n~)lz%xWuEhtetp
z$xNxF6b7YMwuz~np}2UsF5ZiDuFmjtOP#@IB@i6TpT#+-)CP7^D{0QNkbvt{cRb2
z&g#OY^A?`F=J173C8-3CAc|Bju|1_|AS}v4s;esT{eUN*d4WPcPY?tIO2s!##|*u|
z590Duy19fIiFFLqhBe2K-7;L;uw}3f=`ijRYL~()V##RhtR^A#L6L~iBhNq1(_3En
zsp5xLb65(8dmN4T#-Jt|+8WxZIERF*3&LQEg@$mHE8}=9ZEhpYe#TvU_p#xrjZ98W
zgu1()j71SKL~i39BIrIE^@xl>8upzl9UrSoh~$Pg9dC(s*9j}-&Q+6C+Kk+g2P#z!$&suk1a1@F!lcxvgy$K8W`as2I3&@~7F2&7{sD7x=szOl5g_{8T%D
znhUABBg)bDAq3O;8AirO!#O2l4ZY&|Wj7&o!xdn258;%k))NV5@3djkC5H3e?~U*q
z?RuAZp}KOGh{i1io!o9>Zg{~j@a@OGZ^_`rr>VZ)E=7&r%PxSoIZX7>$RhcJi`-zU
zDyrfqjD(hP?17AQs&=LaV)WGPBC=jdC4`#Yh2i>)J&$qI(Or1%!ll;Z7=w~hI&xxX
zQZM7BqRCCN35u{ISK`iTTV|dRY(21@r?1i
zHMPvDsx@%D^TDFIBq^MP)m_h5G`>M{aaNoVhTrT-x250!csiEj2vKHfHGU%xC53dV
zHk~ZJoyFR2iR*eGY4q!2FfxXVqebUdAj!$m{_Tg~;dI}brS`1q8a$bbmu=XGBJzl_
zhaDOp4trpiiovAQX>PmrCcCQ$CkL?$8ZLaC{RQpR5)mC^f{DP
z;U-*TXkY))HyIioPF$}{IzwGeZOGbZW|wK025Bo{##
zo}+zgq6@6FlQg)SKHPv#4E7~N4~vsr5d(ZAb^-|J$&HOklSGlN?7j1?(d@PLcIZUU
z%w4%56Q0jSss#@}{Y27WGHGfnt1X)AhHlN7p?BB8y_c53lx0h~<(lhb5KJd87)8r0
z>^N(3{j^PLi`j39ev6Z=T8~2<@}f{ArEpi@CKaPdO2(g!2Sd>owsPG*k~fZ3ulzh^2?K?Xb2(LbNIkHTWWDZ7!1HQ%U7{#-m-YFwN{DISlp#p
z35>+;1H~wCBy*|^kfoirgAG?U+PU#3EVC0TQ^H+rnp`RxExCvcLP$Yrwv3wU>N6Dm
zqKeD~Difu3vY~CCk)EMmPWB`hYgLq&gELs?f_1K7w$_@Fi4mrzbC;IEq*H0$_4>Q0
zsi<{y>$D_LWp=_l9LdJcfVv9x|$kpT6-<)u31e0_QwmvfGWdl5zg)c
zTb>u~Q1Ao)XSW+JHiH4^oYT(RU-M?irH9>Cx1`ZU^Z?M5m1ddK+QhZ1R}rXq%UR3`
z5i!t}Ane7tj6<4pxrF|mE5{al
zl!Pps`J5YfZcgAS)s5@1kh*;j$61|nSJFo3NfU{sf(y;b!zWzE&
zvn6;bFVuL+=VPjl?_vJT7iL~E
zhxmdt7zn2u@3`?+e)G;>rz~3*6P^+xijfNl|D2qfqJMA*DFpL6+7rldyai4>1_+()
zZ7f^7fcc&C60HC&!jNH=6%wV9PIw>ppui}sI`5<%Z
zTZ#6xMQ&RrnDF5ryY}uwX~o=*IaF6xQxtpR*1y3zX16pWrDSqqBFTz#aue-VW#-Rh
zXByjzSU7(!N^AOuMmTirSU8Ahc1|nD7_`%$h%2YKQJV003ET*db)B@Y)79JU>@g4y
zqcDCSX(UNJ&m)_;gbfBD<$0`Hw3I);_m6nro9?Emw$Vl*5#f+NSm6+72o
zvzpRuHpWl1PVj&O6;M%Gj$iN@867*L)QxX!G82fX5E^$^q~_YySK)c#rQEh-SK@OO
zuR4|@-moj2^~4ruauI5`lP}|))w!8zPW5&t(P$O-+GG*`Cgr6lH(TLe@_pd~KeMW9
zdG(FAa^vc2*|KLRkH7E?Lt`V1Pfd`UnW7N*A=;}XhmRhot+kbGX$dQqF5$(k+w4`7
zwt|3~%4$+l5~zUj$%$~~u_sW0POgtdTF@|-CsT=BW>$3#^>wv)p2xw%#~7O!Pb>r#
zxFxx?>Y!F2$x$5_h{Ou%uYtJg9#HWF_97#n43I>){vhxpOP$C%3H*s^0c*REbc`|K8GW@gy2cdy-a
z5}nyCEg%JBBjcnz&*li0j3myWq$82eO^v-rb5kQFnGCt<8IGUoavGEIOGOb$X?=z<
zgla_Y4kZ`qjfeIkmoYdp%)dPNjZpB>>1ipm`yf#j8b`Ng>B@8G`MJoB=Xo^L*3nQ~
zN2mE(b6o>J+V~g%z5RXc-gl7IE0?mMvxCvGaZYrdvK?HsU{-AnDo~71OgVc8M3F$V
z)72Kz&P%x^u~M3jw%HVXpYibtM#o023RN5mqZ~yhQ0(wcL;^K-^Bdvl`w8QXw(j4_
z@$OR`>p5m|phz~&^SoH<)flamg3?TuC6^jS^QE4u%BnOrVf>g?5$L_sO^(HKvPnd)|BA{zg
zJicvSSK{^!VpLhh>xsvv#yHw@yok=~R7UPL81rgtmtR9oWz`jfNf=Cd1+(i~T)c`c
zJ9pCE6YjBIx@aNob7rHJ4);`M!)54&f^VLDWxb2iI>G04iCfa}ZO|IeleEuiC7;hT
zFgV1>*l1$z$el=3&VFp^j)pkEoeH#5^%>5Mj!Ez_Hm8z}P)ov+N#zN%apjujSDmx`
z<&rU&bSlNVRoBM+T_Fep#iniB=^GrTw6uh)SFK>h(xsG@mQs?=kk1zg0v}~YQ`{Al
z;wgkr)%C
zBE@mL$RiTRHt(}=&U~&}w(8vGlb3?QOys7_ERi;Rl<+1M`~n*{ZQF+;d(9PXImqdC=49O`6i*doi
zX#I`dS{IG3IAwpN@#`k_QT0`kE$7{Ddiy!`E*E9e>z*J8cz(xL9)5NM`%fH9NT5+9fkg{C@uWxpz%UaN
z;|Lj!^QJtH^0IQKXQs&)@}{k!ENF<7b~sN6vlq##au!u)A`)hFXrl=)=I$<#2(81V
z^HTCFH>_uVTgSP_$1YZ5nVQb=okxGbgB!jZ_kANI7?&oZ@qM3$#s(_N%TYnV*!Van
zy1MKY59ymG3~0w_Y*-3!a#SI8wzX4HUe5H)4Eqiqw#Kq#;R5D$v{PMG$?2Y6p5C}A
zR--UCSp~`t^+$%Gi*_5hMN~Yli(NDp?{1PYnOuj)B!%GSRqJ@u?RWlCmTz5@2BWp+
z*{v_~&{ID$;YTJKnDO4C@Cm(Y`EusAwW77=_{lDI?A?p+2a%Y!MPH*tQ#f8F(yWVC
z_kp0ZqYb4MU8j2(n-~WHE0!##vm=xv8yFg9^Y$HR;?1^7Mcs2phJ&z@@=X^yvENeY
zWQW_V`LmXMer5`V#G=h&qftsz
zmMx{DZ8lO!di(m>x^q`@y3#W4;svC_%p{Abmm}iNczJ0lwY4<_LBNsY$B_cc%PLs1
za6W+w=<6F~$`77%RPxtq9cBgmxF
zp>l+?=g4KQSw61THgqgoyol1$Ec*`}4*O6-uzKY(lu{J@fM+(nXxj+sbec>mO>TNR
zk)Q@zC6=8=fT9gGb|<&O8NW*GzQ%}t3IVe!Yq|5r^}Obm_2+Ep{*p47g74cbW&Yd_
z+S^(=a^fU`@}0qtaLIaWOEZ2DFf=^M_~e9@b8T*DV9lx(ytr)#{euIE)xZ>0Inms5
zOG6`bTALXe9%c9b0|*%|xTvbEKuXEpeFtq{$dewgxM3YZ5b(_Nn;Gs#sIlCq(iEzc8p
zUMl3vSkWKNf_ZZ(FUvAMJ|2(H3U>j85dkoJ`aCIFIKLB(=0w+Nd_NoxE6rvRQjjkc
zLK>v;RI67kqqe4+-u?kg@OIroG;TGPIBUJ6prN{+C36>X^VMrv
zxoFAHl%2oq4JO*KsD{cDk{0P>UFGIyo|P%R%$A%$fPqb`?bF)4W^;C
zjt{)$xA@|B{*~^YKIXQ~rn;(%(>*;g0k{H3{e{x>U=ANY$=a(|F{`H9iVI;ty-Ye|
z{_e$~oJ`8c3pC|r6)c$F!Q^C)&D(Z{dcDR|Oy#DSoSdSfqKs53WerInLiv#(2>)9v
zt5g;3O|saFcGnxn$nSMa*6@b)chXQ>cg7Cz^K~vtgAqdT%Ij}pc2f%vJ^mw(9XrK>
zc^&li4U)^v0P(!f=+GFWqZ3qAl(J;uLJl4|OiFr;j84$hSWi`XCApbt><`%rC9Gx3
zO3HZUx*NH6=^7@-r>MBOis_k|#5BlMeu|b^%`97-GpMg(Xmo_SSv8cGm4!J?q&V&g
zXPh>P`J}xxQb?-GD|!2C-ptj@R#I_}h)?J2+MkcDvgi7Yj*nTwk}BfHVv@_zsZow}
zonY^QLt#5%dYt7;7ZL=TZQJ+IJj-o#6#xJMSV=@d
zR8JK|j;gHj+yuK1?B}%M-nO^5kV$3e?(S!7a>DBT78`$6MJ1V3nzGVTYOAW5-PCf<
zA^me{Fw;N@m%&j=*-;H;X_ADq%vvXIqGz_&|DuZ=+-XXgXk4ECxs3Neii_bA&ZK}Y
zVBY0mFrmWbnNNA@93tpvyZJ&0yoJTC89Tc
z`2h)h6!z>%
literal 0
HcmV?d00001
diff --git a/doc/exl3.md b/doc/exl3.md
new file mode 100644
index 0000000..76c2b81
--- /dev/null
+++ b/doc/exl3.md
@@ -0,0 +1,77 @@
+# EXL3 quantization
+
+The new **EXL3** format is a variant of [**QTIP**](https://github.com/Cornell-RelaxML/qtip). Like **QTIP** it uses a procedural codebook and encodes high-dimensional vectors into optimal tail-biting trellis structures, but it deviates from **QTIP** in how tensors are regularized and packed. A full description of the format is coming, but until then I refer to the code for the [quantizer](../exllamav3/modules/quant/exl3_lib/quantize.py) and associated [kernels](../exllamav3/exllamav3_ext/quant), the [**QTIP**](https://arxiv.org/abs/2406.11235) and [**QuIP#**](https://arxiv.org/abs/2402.04396) papers, as well as this [excellent writeup](https://www.together.ai/blog/even-better-even-faster-quantized-llms-with-qtip) on **QTIP** from together.ai.
+
+It turns out to be difficult to collect enough examples of models converted with the various SOTA (or SOTA-adjacent) methods. I attribute the lack of options largely to how difficult it is to work with these formats in the first place, hence this project. Following are some benchmarks and comparisons to other formats I was able to find samples of. A couple of notes:
+
+- I have not yet been able to make regular **QTIP** inference work (go figure) but it's probably safe to assume it would match or outperform **EXL3** in accuracy, being largely the same method except with more options.
+- Accounting for quantization of the output layer can make a huge difference in practice, especially for smaller models. So I am including two versions of each perplexity graph, one with bitrate on the horizontal axis, and one that measures the entire VRAM footprint of the weights (not counting the embedding layer which for most inference tasks can be relegated to system RAM.)
+- **GGUF** i-quants are abundant, and it's worth noting that they hold up well in comparison to SOTA formats.
+
+### Perplexity tests
+
+The [eval/compare_q.py](../eval/compare_q.py) script makes an apples-to-apples comparison between formats, measuring perplexity on the wiki2 test set across available bitrates while ensuring that tokenization and scoring remains consistent throughout.
+
+
+
+
+
+
+
+
+
+
+
+
+### HumanEval
+
+For the models tested here, HumanEval scores align closely with results advertised by the publishers or collected from other sources. Some deviation is to be expected due to differences in prompting and sampling, as well as random variation. See the [eval/humaneval.py](../eval/humaneval.py) script for specifics. The occasional bump around 3 bpw is repeatable and statistically significant, likely worth investigating.
+
+
+
+
+
+### Further work
+
+More evaluations are underway (MMLU, MMLU-Pro, etc.), and more models will be tested as architectures are added.
diff --git a/doc/gumbel_eval.png b/doc/gumbel_eval.png
new file mode 100644
index 0000000000000000000000000000000000000000..c5651a9dbf7d3bc6cbf8abbe6779d1f3c5d27726
GIT binary patch
literal 32066
zcmdSBcRZGT7(RTVVN()GHkGV2Br~H(B_$FWrIM`dJu;h0NlI3UQbt0uLUz&AutO@@
zd%wq*=QrN>^M2mn-*2Di>G9m|`?|01I?v-c&f~l;9z3vzg^`Dmq9_(^EsaAIMe9dV
zG_CZ@@QUE8qA>htr?ci!=fn1=o!u-PPf`0VoE>cJoo&vZ5O6)^=ycZJZj01r87UbF
z0V`)`2dACV(zgHe9a8p=XQXdA9(BZ37#y^WoG6OTg8WO9qLO@;qP)|zHFoK{$MiP4
zxgY*AzkFc)-GNu*iT?^SwTx&_BnTL)C1yc+s`}-SnuUht+|)!pY9;Lm}e_Hg{R>Y~gHe!D_`MTiRd)cVqa
zJ@{>>#{cg>the#H2!rsRYZBR`(q=`x5hCp$lXQ~Ccr|Vr=DYU3Pdl!VS1TaCIz+{B
zM{~YgXo