mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-05-01 03:31:15 +00:00
* refactor: move legacy code to archive/ directory - Moved ktransformers, csrc, third_party, merge_tensors to archive/ - Moved build scripts and configurations to archive/ - Kept kt-kernel, KT-SFT, doc, and README files in root - Preserved complete git history for all moved files * refactor: restructure repository to focus on kt-kernel and KT-SFT modules * fix README * fix README * fix README * fix README * docs: add performance benchmarks to kt-kernel section Add comprehensive performance data for kt-kernel to match KT-SFT's presentation: - AMX kernel optimization: 21.3 TFLOPS (3.9× faster than PyTorch) - Prefill phase: up to 20× speedup vs baseline - Decode phase: up to 4× speedup - NUMA optimization: up to 63% throughput improvement - Multi-GPU (8×L20): 227.85 tokens/s total throughput with DeepSeek-R1 FP8 Source: https://lmsys.org/blog/2025-10-22-KTransformers/ This provides users with concrete performance metrics for both core modules, making it easier to understand the capabilities of each component. * refactor: improve kt-kernel performance data with specific hardware and models Replace generic performance descriptions with concrete benchmarks: - Specify exact hardware: 8×L20 GPU + Xeon Gold 6454S, Single/Dual-socket Xeon + AMX - Include specific models: DeepSeek-R1-0528 (FP8), DeepSeek-V3 (671B) - Show detailed metrics: total throughput, output throughput, concurrency details - Match KT-SFT presentation style for consistency This provides users with actionable performance data they can use to evaluate hardware requirements and expected performance for their use cases. 
* fix README * docs: clean up performance table and improve formatting * add pic for README * refactor: simplify .gitmodules and backup legacy submodules - Remove 7 legacy submodules from root .gitmodules (archive/third_party/*) - Keep only 2 active submodules for kt-kernel (llama.cpp, pybind11) - Backup complete .gitmodules to archive/.gitmodules - Add documentation in archive/README.md for researchers who need legacy submodules This reduces initial clone size by ~500MB and avoids downloading unused dependencies. * refactor: move doc/ back to root directory Keep documentation in root for easier access and maintenance. * refactor: consolidate all images to doc/assets/ - Move kt-kernel/assets/heterogeneous_computing.png to doc/assets/ - Remove KT-SFT/assets/ (images already in doc/assets/) - Update KT-SFT/README.md image references to ../doc/assets/ - Eliminates ~7.9MB image duplication - Centralizes all documentation assets in one location * fix pic path for README
160 lines
5.0 KiB
CMake
160 lines
5.0 KiB
CMake
cmake_minimum_required(VERSION 3.21)

# Choose a GNU C++ compiler before project() enables the language. The first of
# g++-13, g++-12, g++-11 or plain g++ that find_program locates is used, and
# configuration stops if none of them is available.
find_program(GCC_COMPILER NAMES g++-13 g++-12 g++-11 g++ REQUIRED)
set(CMAKE_CXX_COMPILER ${GCC_COMPILER})

project(kvcache-manager VERSION 0.1.0)

# Build as C++20 and refuse to configure with a compiler that cannot provide
# it. Without CMAKE_CXX_STANDARD_REQUIRED, CMake silently falls back to an
# older standard, which can happen when only an old plain `g++` was found above.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
|
|
|
# Alternative flag sets used during development, kept for reference:
# set(CMAKE_CXX_FLAGS "-fPIC -O3 -ffast-math -march=native -Wall -Wextra -Wpedantic -fvisibility=hidden -s")
# set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -Wpedantic -g -fsanitize=address")
# set(CMAKE_CXX_FLAGS "-march=native -Wall -Wextra -Wpedantic -g")
# set(CMAKE_CXX_FLAGS "-fPIC -O3 -ffast-math -march=native -Wall -Wextra -g")

# Default to a Debug build, but only when no build type was requested. An
# unconditional set() would shadow -DCMAKE_BUILD_TYPE=... given on the command
# line (or a value chosen by an including project).
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Debug")
endif()

# Emit compile_commands.json for editors and tooling.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Tests and the Python extension are hard-wired off here. These are plain
# variables, not cache options, so they must be edited in this file to enable
# the corresponding sections further down.
set(BUILD_TEST OFF)
set(BUILD_PYTHON_EXT OFF)
|
|
|
|
# Align libstdc++'s _GLIBCXX_USE_CXX11_ABI with the value PyTorch was built
# with. A value supplied by the caller (for example
# -D_GLIBCXX_USE_CXX11_ABI=0) is used as-is; otherwise the interpreter found by
# find_package(Python3) is asked via torch.compiled_with_cxx11_abi().
if(NOT DEFINED _GLIBCXX_USE_CXX11_ABI)
  find_package(Python3 REQUIRED COMPONENTS Interpreter)

  execute_process(
    COMMAND ${Python3_EXECUTABLE} -c
            "import torch; print('1' if torch.compiled_with_cxx11_abi() else '0')"
    RESULT_VARIABLE ABI_PROBE_STATUS
    OUTPUT_VARIABLE ABI_FLAG
    ERROR_VARIABLE ABI_PROBE_ERROR
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ERROR_STRIP_TRAILING_WHITESPACE
  )

  # Stop here if the probe failed (for instance torch is not importable).
  # Otherwise an empty value would be passed as `-D_GLIBCXX_USE_CXX11_ABI=`
  # and only surface later as an obscure compile error.
  if(NOT ABI_PROBE_STATUS EQUAL 0 OR NOT "${ABI_FLAG}" MATCHES "^[01]$")
    message(FATAL_ERROR
      "Could not detect _GLIBCXX_USE_CXX11_ABI from PyTorch using "
      "${Python3_EXECUTABLE} (status: ${ABI_PROBE_STATUS}, output: '${ABI_FLAG}').\n"
      "${ABI_PROBE_ERROR}\n"
      "Install torch for this interpreter or pass -D_GLIBCXX_USE_CXX11_ABI=0|1.")
  endif()

  # No FORCE needed: this branch only runs when the variable is not defined,
  # so there is no existing cache entry to override.
  set(_GLIBCXX_USE_CXX11_ABI ${ABI_FLAG} CACHE STRING "C++11 ABI setting from PyTorch")
endif()

# Pass the setting to the compiler whether it was auto-detected or supplied.
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI})

message(STATUS "_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI}")
|
|
|
|
# I/O backend selection. USE_IO_URING is not set in this file, so "aio" is
# reported unless the variable is provided from outside or the line below is
# uncommented.
# set(USE_IO_URING ON)
if(NOT USE_IO_URING)
  message(STATUS "Using aio")
else()
  message(STATUS "Using io_uring")
  # Expose the choice to the sources as the USE_IO_URING preprocessor macro.
  add_compile_definitions(USE_IO_URING)
endif()
|
|
|
|
# Collect every C++ source and header under src/ and test/. In this file the
# list is only used by the `format` target below.
file(GLOB_RECURSE ALL_SOURCE_FILES src/*.cpp src/*.h test/*.cpp test/*.h test/*.hpp)

# Custom target that reformats all collected files in place with clang-format,
# using the style file discovered by clang-format (-style=file). It is skipped
# when a `format` target already exists.
if(NOT TARGET format)
  add_custom_target(
    format
    COMMAND clang-format -i -style=file ${ALL_SOURCE_FILES}
    COMMENT "Running clang-format on all source files"
    # VERBATIM makes argument escaping identical on every platform, so file
    # paths containing spaces or shell metacharacters reach clang-format intact.
    VERBATIM
  )
endif()
|
|
|
|
# Locate the installed PyTorch package by asking Python for torch.__path__[0].
# Prefer the interpreter already selected through find_package(Python3), so the
# same interpreter answers every torch query in this file. Fall back to
# `python3` from PATH when no interpreter has been selected.
if(Python3_EXECUTABLE)
  set(TORCH_PROBE_PYTHON "${Python3_EXECUTABLE}")
else()
  set(TORCH_PROBE_PYTHON python3)
endif()

execute_process(
  COMMAND ${TORCH_PROBE_PYTHON} -c "import torch; print(torch.__path__[0])"
  RESULT_VARIABLE TORCH_PROBE_STATUS
  OUTPUT_VARIABLE TORCH_INSTALL_PREFIX
  ERROR_VARIABLE TORCH_PROBE_ERROR
  OUTPUT_STRIP_TRAILING_WHITESPACE
  ERROR_STRIP_TRAILING_WHITESPACE
)

# Fail with a clear message here instead of letting find_package(Torch) below
# fail on a path built from an empty prefix.
if(NOT TORCH_PROBE_STATUS EQUAL 0 OR "${TORCH_INSTALL_PREFIX}" STREQUAL "")
  message(FATAL_ERROR
    "Could not locate PyTorch using ${TORCH_PROBE_PYTHON} "
    "(status: ${TORCH_PROBE_STATUS}).\n${TORCH_PROBE_ERROR}")
endif()

message(STATUS "Found PyTorch at: ${TORCH_INSTALL_PREFIX}")

# Example of a manually pinned location:
# set(TORCH_INSTALL_PREFIX "/home/xwy/.conda/envs/kvc/lib/python3.12/site-packages/torch")

# `PATHS` is the find_library keyword for extra search directories. The
# previous spelling `PATH` only worked because the short-hand signature treated
# the word itself as one more directory to search.
find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")

# Load Torch's CMake package strictly from the detected installation.
find_package(Torch REQUIRED PATHS "${TORCH_INSTALL_PREFIX}/share/cmake/Torch" NO_DEFAULT_PATH)
|
|
|
|
# Make the shared third_party directory (two levels above this one) visible as
# an include root to everything configured from here on.
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)

# include_directories(/usr/include/tbb)
# link_directories(/usr/lib64)

# Build switch for the NPU variant. When enabled, the sources see the macro
# KTRANSFORMERS_USE_NPU defined as 1.
option(KTRANSFORMERS_USE_NPU "ktransformers: use NPU" OFF)
if(KTRANSFORMERS_USE_NPU)
  add_compile_definitions(KTRANSFORMERS_USE_NPU=1)
endif()
|
|
# TBB is required for every build flavour.
find_package(TBB REQUIRED)

# CUDA is required only for the GPU build; the NPU build does not need it.
# find_package(CUDA REQUIRED) is intentionally not run when
# KTRANSFORMERS_USE_NPU is enabled.
if(NOT KTRANSFORMERS_USE_NPU)
  find_package(CUDA REQUIRED)
endif()
|
|
|
|
# The prometheus-cpp::pull target is not looked up in this file (the
# find_package call is commented out), so it must already exist when this point
# is reached. Presumably an enclosing project provides it; TODO confirm.
# find_package(prometheus-cpp CONFIG REQUIRED)
if(TARGET prometheus-cpp::pull)
  message(STATUS "prometheus Found!")
else()
  message(FATAL_ERROR "prometheus-cpp::pull not found")
endif()
|
|
|
|
# Report the CUDA toolkit that was found. Nothing is checked or printed for
# NPU builds, which do not use CUDA.
if(NOT KTRANSFORMERS_USE_NPU)
  if(NOT CUDA_FOUND)
    message(FATAL_ERROR "CUDA not found!")
  endif()

  message(STATUS "CUDA Found!")
  message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
  message(STATUS "CUDA Toolkit Root: ${CUDA_TOOLKIT_ROOT_DIR}")
endif()
|
|
|
|
# Library sources.
add_subdirectory(src)

# Tests are configured only when BUILD_TEST is on.
if(BUILD_TEST)
  # The NPU flavour additionally configures the bundled spdlog before the
  # tests, and (re)defines THIRD_PARTY_DIR for the rest of this file.
  # NOTE(review): this path climbs three directory levels, while the other
  # third_party references in this file climb two. Confirm which is intended.
  if(KTRANSFORMERS_USE_NPU)
    message(STATUS "Build test...")
    set(THIRD_PARTY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party)
    add_subdirectory(${THIRD_PARTY_DIR}/spdlog ${CMAKE_CURRENT_SOURCE_DIR}/../build/third_party/spdlog)
  endif()

  add_subdirectory(test)
endif()
|
|
|
|
message(STATUS "BUILD_PYTHON_EXT: ${BUILD_PYTHON_EXT}")

# Optional pybind11 extension module `kvc2_ext`, built from src/bind.cpp.
if(BUILD_PYTHON_EXT)
  # Configure the bundled pybind11 unless the target is already available.
  # THIRD_PARTY_DIR / THIRD_PARTY_BUILD_DIR are expected to be defined before
  # this point. Presumably an enclosing project sets them; TODO confirm.
  if(NOT TARGET pybind11::pybind11)
    add_subdirectory(${THIRD_PARTY_DIR}/pybind11 ${THIRD_PARTY_BUILD_DIR}/pybind11)
  endif()

  pybind11_add_module(kvc2_ext src/bind.cpp)

  # EXAMPLE_VERSION_INFO is defined by setup.py and reaches the C++ code as the
  # VERSION_INFO preprocessor definition.
  target_compile_definitions(kvc2_ext PRIVATE VERSION_INFO=${EXAMPLE_VERSION_INFO})
  message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}")
  target_include_directories(kvc2_ext PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/spdlog/include)

  target_link_libraries(kvc2_ext PUBLIC kvc2 async_store)

  # Both the built module and its Python helper are installed into
  # <build dir>/output.
  install(TARGETS kvc2_ext LIBRARY DESTINATION ${CMAKE_BINARY_DIR}/output)
  install(FILES src/kvc2_utils.py DESTINATION ${CMAKE_BINARY_DIR}/output)
endif()
|
|
|
|
# PhotonLibOS configuration. The cache entries below are set before its
# subdirectory is added so that they are visible while it is configured.
set(PHOTON_CXX_STANDARD 14 CACHE INTERNAL "C++ standard")
if(USE_IO_URING)
  set(PHOTON_ENABLE_URING ON CACHE BOOL "Enable io_uring")
endif()

# CMAKE_CXX_FLAGS is replaced here, not appended to: any earlier value is
# discarded from this point on. The NPU flavour targets armv8.2-a; every other
# build tunes for the build host.
if(NOT KTRANSFORMERS_USE_NPU)
  set(CMAKE_CXX_FLAGS "-O3 -march=native")
else()
  set(CMAKE_CXX_FLAGS "-O3 -march=armv8.2-a")
endif()
message(STATUS "CMAKE_CXX_FLAGS of PhotonLibOS: ${CMAKE_CXX_FLAGS}")

# THIRD_PARTY_DIR / THIRD_PARTY_BUILD_DIR are expected to be defined before
# this point. Presumably an enclosing project sets them; TODO confirm.
add_subdirectory(${THIRD_PARTY_DIR}/PhotonLibOS ${THIRD_PARTY_BUILD_DIR}/PhotonLibOS)
|