Enable IQ4_NL for KV-cache in token generation using Flash Attention (#99)

* Enable IQ4_NL for V-cache in token generation * We don't need these * Update printour of allowed quantized KV-cache combinations * Add IQ4_NL + IQ4_NL to FA This is a better alternative than Q4_0 + Q4_0 for the VRAM poor. * Remove file added by mistake * Fix typo, which is not really a bug --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-04-29 19:01:47 +00:00 · 2024-10-21 12:16:54 +02:00
parent d336410509
commit 7c5a91daf1
22 changed files with 214 additions and 37 deletions
--- a/6
+++ b/6
@@ -246,11 +246,11 @@ endif
 # Compile flags
 #

-# keep standard at C11 and C++11
+# keep standard at C11 and C++17
 MK_CPPFLAGS  = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon
 MK_CFLAGS    = -std=c11   -fPIC
 MK_CXXFLAGS  = -std=c++17 -fPIC
-MK_NVCCFLAGS = -std=c++11
+MK_NVCCFLAGS = -std=c++17

 ifdef LLAMA_NO_CCACHE
 GGML_NO_CCACHE := 1
@@ -598,6 +598,8 @@ else
 	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
 	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
 	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
+	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-iq4_nl.cu))
+	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*:iq4_nl-iq4_nl.cu))
 endif # GGML_CUDA_FA_ALL_QUANTS

 ifdef GGML_CUDA