mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-29 19:01:47 +00:00
Enable IQ4_NL for KV-cache in token generation using Flash Attention (#99)
* Enable IQ4_NL for V-cache in token generation * We don't need these * Update printour of allowed quantized KV-cache combinations * Add IQ4_NL + IQ4_NL to FA This is a better alternative than Q4_0 + Q4_0 for the VRAM poor. * Remove file added by mistake * Fix typo, which is not really a bug --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
6
Makefile
6
Makefile
@@ -246,11 +246,11 @@ endif
|
||||
# Compile flags
|
||||
#
|
||||
|
||||
# keep standard at C11 and C++11
|
||||
# keep standard at C11 and C++17
|
||||
MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon
|
||||
MK_CFLAGS = -std=c11 -fPIC
|
||||
MK_CXXFLAGS = -std=c++17 -fPIC
|
||||
MK_NVCCFLAGS = -std=c++11
|
||||
MK_NVCCFLAGS = -std=c++17
|
||||
|
||||
ifdef LLAMA_NO_CCACHE
|
||||
GGML_NO_CCACHE := 1
|
||||
@@ -598,6 +598,8 @@ else
|
||||
OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
|
||||
OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
|
||||
OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
|
||||
OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-iq4_nl.cu))
|
||||
OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*:iq4_nl-iq4_nl.cu))
|
||||
endif # GGML_CUDA_FA_ALL_QUANTS
|
||||
|
||||
ifdef GGML_CUDA
|
||||
|
||||
Reference in New Issue
Block a user