mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-30 11:21:56 +00:00
CUDA: compress-mode size (#1110)
Co-authored-by: firecoperana <firecoperana>
This commit is contained in:
@@ -135,6 +135,9 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
|
|||||||
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
||||||
option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ON)
|
option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ON)
|
||||||
set (GGML_CUDA_FUSION "1" CACHE STRING "ggml: enable/disable fusion")
|
set (GGML_CUDA_FUSION "1" CACHE STRING "ggml: enable/disable fusion")
|
||||||
|
# Compression mode handed to nvcc through -compress-mode; the flag is only
# added when the detected CUDA toolkit is 12.8 or newer.
set(GGML_CUDA_COMPRESSION_MODE
    "size"
    CACHE STRING
    "ggml: cuda link binary compression mode; requires cuda 12.8+")
# Advertise the known modes so cmake-gui / ccmake can offer them as choices.
set_property(CACHE GGML_CUDA_COMPRESSION_MODE
             PROPERTY STRINGS none speed balance size)
|
||||||
|
|
||||||
option(GGML_IQK_FLASH_ATTENTION "ggml: enable the IQK FlashAttention CPU kernels" ON)
|
option(GGML_IQK_FLASH_ATTENTION "ggml: enable the IQK FlashAttention CPU kernels" ON)
|
||||||
option(GGML_IQK_FA_ALL_QUANTS "ggml: compile all quants for IQK FlashAttention" OFF)
|
option(GGML_IQK_FA_ALL_QUANTS "ggml: compile all quants for IQK FlashAttention" OFF)
|
||||||
|
|||||||
@@ -1171,7 +1171,17 @@ set(CUDA_CXX_FLAGS "")
|
|||||||
|
|
||||||
if (GGML_CUDA)
|
if (GGML_CUDA)
|
||||||
set(CUDA_FLAGS -use_fast_math -extended-lambda -lineinfo)
|
set(CUDA_FLAGS -use_fast_math -extended-lambda -lineinfo)
|
||||||
|
# Report the toolkit version as a regular status line; STATUS supplies the
# "-- " prefix and routes the text to stdout instead of stderr.
message(STATUS "CUDA Toolkit: ${CUDAToolkit_VERSION}")
|
||||||
|
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
    # The -compress-mode flag is only passed for CUDA 12.8 and newer.
    # GGML_CUDA_COMPRESSION_MODE options are:
    # - none (not recommended)
    # - speed (nvcc's default)
    # - balance
    # - size
    if ("${GGML_CUDA_COMPRESSION_MODE}" STREQUAL "")
        # An empty cache value would yield a dangling "-compress-mode=" flag;
        # skip it so nvcc falls back to its own default.
        message(WARNING "GGML_CUDA_COMPRESSION_MODE is empty; not passing -compress-mode to nvcc")
    else()
        list(APPEND CUDA_FLAGS -compress-mode=${GGML_CUDA_COMPRESSION_MODE})
    endif()
endif()
|
||||||
|
|
||||||
if (GGML_FATAL_WARNINGS)
|
if (GGML_FATAL_WARNINGS)
|
||||||
list(APPEND CUDA_FLAGS -Werror all-warnings)
|
list(APPEND CUDA_FLAGS -Werror all-warnings)
|
||||||
endif()
|
endif()
|
||||||
|
|||||||
Reference in New Issue
Block a user