diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 88ee35ea..095cdac9 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -135,6 +135,9 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ON) set (GGML_CUDA_FUSION "1" CACHE STRING "ggml: enable/disable fusion") +set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING + "ggml: cuda link binary compression mode; requires cuda 12.8+") +set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size") option(GGML_IQK_FLASH_ATTENTION "ggml: enable the IQK FlashAttention CPU kernels" ON) option(GGML_IQK_FA_ALL_QUANTS "ggml: compile all quants for IQK FlashAttention" OFF) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index e6244bb5..818bafa2 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1171,7 +1171,17 @@ set(CUDA_CXX_FLAGS "") if (GGML_CUDA) set(CUDA_FLAGS -use_fast_math -extended-lambda -lineinfo) + message("-- CUDA Toolkit: ${CUDAToolkit_VERSION}") + if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8") + # Options are: + # - none (not recommended) + # - speed (nvcc's default) + # - balance + # - size + list(APPEND CUDA_FLAGS -compress-mode=${GGML_CUDA_COMPRESSION_MODE}) + endif() + if (GGML_FATAL_WARNINGS) list(APPEND CUDA_FLAGS -Werror all-warnings) endif()