Files
ktransformers/archive/csrc/balance_serve/kvc2/CMakeLists.txt
Jiaqi Liao 57d14d22bc Refactor: restructure repository to focus on kt-kernel and KT-SFT modulesq recon (#1581)
* refactor: move legacy code to archive/ directory

  - Moved ktransformers, csrc, third_party, merge_tensors to archive/
  - Moved build scripts and configurations to archive/
  - Kept kt-kernel, KT-SFT, doc, and README files in root
  - Preserved complete git history for all moved files

* refactor: restructure repository to focus on kt-kernel and KT-SFT modules

* fix README

* fix README

* fix README

* fix README

* docs: add performance benchmarks to kt-kernel section

Add comprehensive performance data for kt-kernel to match KT-SFT's presentation:
- AMX kernel optimization: 21.3 TFLOPS (3.9× faster than PyTorch)
- Prefill phase: up to 20× speedup vs baseline
- Decode phase: up to 4× speedup
- NUMA optimization: up to 63% throughput improvement
- Multi-GPU (8×L20): 227.85 tokens/s total throughput with DeepSeek-R1 FP8

Source: https://lmsys.org/blog/2025-10-22-KTransformers/

This provides users with concrete performance metrics for both core modules,
making it easier to understand the capabilities of each component.

* refactor: improve kt-kernel performance data with specific hardware and models

Replace generic performance descriptions with concrete benchmarks:
- Specify exact hardware: 8×L20 GPU + Xeon Gold 6454S, Single/Dual-socket Xeon + AMX
- Include specific models: DeepSeek-R1-0528 (FP8), DeepSeek-V3 (671B)
- Show detailed metrics: total throughput, output throughput, concurrency details
- Match KT-SFT presentation style for consistency

This provides users with actionable performance data they can use to evaluate
hardware requirements and expected performance for their use cases.

* fix README

* docs: clean up performance table and improve formatting

* add pic for README

* refactor: simplify .gitmodules and backup legacy submodules

- Remove 7 legacy submodules from root .gitmodules (archive/third_party/*)
- Keep only 2 active submodules for kt-kernel (llama.cpp, pybind11)
- Backup complete .gitmodules to archive/.gitmodules
- Add documentation in archive/README.md for researchers who need legacy submodules

This reduces initial clone size by ~500MB and avoids downloading unused dependencies.

* refactor: move doc/ back to root directory

Keep documentation in root for easier access and maintenance.

* refactor: consolidate all images to doc/assets/

- Move kt-kernel/assets/heterogeneous_computing.png to doc/assets/
- Remove KT-SFT/assets/ (images already in doc/assets/)
- Update KT-SFT/README.md image references to ../doc/assets/
- Eliminates ~7.9MB image duplication
- Centralizes all documentation assets in one location

* fix pic path for README
2025-11-10 17:42:26 +08:00

160 lines
5.0 KiB
CMake

cmake_minimum_required(VERSION 3.21)
find_program(GCC_COMPILER NAMES g++-13 g++-12 g++-11 g++ REQUIRED)
set(CMAKE_CXX_COMPILER ${GCC_COMPILER})
project(kvcache-manager VERSION 0.1.0)
set(CMAKE_CXX_STANDARD 20)
# set(CMAKE_CXX_FLAGS "-fPIC -O3 -ffast-math -march=native -Wall -Wextra -Wpedantic -fvisibility=hidden -s")
# set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -Wpedantic -g -fsanitize=address")
# set(CMAKE_CXX_FLAGS "-march=native -Wall -Wextra -Wpedantic -g")
# set(CMAKE_CXX_FLAGS "-fPIC -O3 -ffast-math -march=native -Wall -Wextra -g")
# set(CMAKE_BUILD_TYPE "Release")
set(CMAKE_BUILD_TYPE "Debug")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(BUILD_TEST OFF)
set(BUILD_PYTHON_EXT OFF)
if(NOT DEFINED _GLIBCXX_USE_CXX11_ABI)
find_package(Python3 REQUIRED COMPONENTS Interpreter)
execute_process(
COMMAND ${Python3_EXECUTABLE} -c
"import torch; print('1' if torch.compiled_with_cxx11_abi() else '0')"
OUTPUT_VARIABLE ABI_FLAG
OUTPUT_STRIP_TRAILING_WHITESPACE
)
set(_GLIBCXX_USE_CXX11_ABI ${ABI_FLAG} CACHE STRING "C++11 ABI setting from PyTorch" FORCE)
endif()
# 无论是否是自动检测,都传给编译器
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI})
message(STATUS "_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI}")
# set(USE_IO_URING ON)
if(USE_IO_URING)
message(STATUS "Using io_uring")
add_compile_definitions(USE_IO_URING)
else()
message(STATUS "Using aio")
endif()
file(GLOB_RECURSE ALL_SOURCE_FILES src/*.cpp src/*.h test/*.cpp test/*.h test/*.hpp)
# 添加一个自定义目标来格式化所有代码
if(NOT TARGET format)
add_custom_target(
format
COMMAND clang-format
-i
-style=file
${ALL_SOURCE_FILES}
COMMENT "Running clang-format on all source files"
)
endif()
execute_process(
COMMAND python3 -c "import torch; print(torch.__path__[0])"
OUTPUT_VARIABLE TORCH_INSTALL_PREFIX
OUTPUT_STRIP_TRAILING_WHITESPACE
)
message(STATUS "Found PyTorch at: ${TORCH_INSTALL_PREFIX}")
# set(TORCH_INSTALL_PREFIX "/home/xwy/.conda/envs/kvc/lib/python3.12/site-packages/torch")
find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib")
find_package(Torch REQUIRED PATHS "${TORCH_INSTALL_PREFIX}/share/cmake/Torch" NO_DEFAULT_PATH)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
# include_directories(/usr/include/tbb)
# link_directories(/usr/lib64)
option(KTRANSFORMERS_USE_NPU "ktransformers: use NPU" OFF)
if(KTRANSFORMERS_USE_NPU)
add_definitions(-DKTRANSFORMERS_USE_NPU=1)
endif()
find_package(TBB REQUIRED)
if(KTRANSFORMERS_USE_NPU)
# NPU 构建
# find_package(CUDA REQUIRED) # NPU 情况不需要 CUDA
else()
# GPU 构建
find_package(CUDA REQUIRED)
endif()
# find_package(prometheus-cpp CONFIG REQUIRED)
if(NOT TARGET prometheus-cpp::pull)
message(FATAL_ERROR "prometheus-cpp::pull not found")
else()
message(STATUS "prometheus Found!")
endif()
if(KTRANSFORMERS_USE_NPU)
# NPU 情况下不检查 CUDA
else()
if(CUDA_FOUND)
message(STATUS "CUDA Found!")
message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
message(STATUS "CUDA Toolkit Root: ${CUDA_TOOLKIT_ROOT_DIR}")
else()
message(FATAL_ERROR "CUDA not found!")
endif()
endif()
add_subdirectory(src)
if(BUILD_TEST)
if(KTRANSFORMERS_USE_NPU)
message(STATUS "Build test...")
set(THIRD_PARTY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party)
add_subdirectory(${THIRD_PARTY_DIR}/spdlog ${CMAKE_CURRENT_SOURCE_DIR}/../build/third_party/spdlog)
add_subdirectory(test)
else()
add_subdirectory(test)
endif()
endif()
message(STATUS "BUILD_PYTHON_EXT: ${BUILD_PYTHON_EXT}")
if(BUILD_PYTHON_EXT)
if(NOT TARGET pybind11::pybind11)
add_subdirectory(${THIRD_PARTY_DIR}/pybind11 ${THIRD_PARTY_BUILD_DIR}/pybind11)
endif()
pybind11_add_module(kvc2_ext src/bind.cpp)
# EXAMPLE_VERSION_INFO is defined by setup.py and passed into the C++ code as a
# define (VERSION_INFO) here.
target_compile_definitions(kvc2_ext PRIVATE VERSION_INFO=${EXAMPLE_VERSION_INFO})
message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}")
target_include_directories(kvc2_ext PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/spdlog/include)
target_link_libraries(kvc2_ext PUBLIC kvc2 async_store)
install(TARGETS kvc2_ext LIBRARY
DESTINATION ${CMAKE_BINARY_DIR}/output)
install(FILES src/kvc2_utils.py
DESTINATION ${CMAKE_BINARY_DIR}/output)
endif()
if(USE_IO_URING)
set(PHOTON_ENABLE_URING ON CACHE BOOL "Enable io_uring")
endif()
set(PHOTON_CXX_STANDARD 14 CACHE INTERNAL "C++ standard")
if(KTRANSFORMERS_USE_NPU)
set(CMAKE_CXX_FLAGS "-O3 -march=armv8.2-a")
else()
set(CMAKE_CXX_FLAGS "-O3 -march=native")
endif()
message(STATUS "CMAKE_CXX_FLAGS of PhotonLibOS: ${CMAKE_CXX_FLAGS}")
add_subdirectory(${THIRD_PARTY_DIR}/PhotonLibOS ${THIRD_PARTY_BUILD_DIR}/PhotonLibOS)