From aef6672dd84024aef8a1355306e3b6a7a0bf03fc Mon Sep 17 00:00:00 2001 From: ZiWei Yuan Date: Sat, 15 Nov 2025 18:26:49 +0800 Subject: [PATCH] [docs]: add contribuing guide and add hooks install (#1613) * [feat]: update kt-kernel hooks and add contribution guide * [docs]: add contributing guide * [style]: format the python file and cpp file in kt-kernel --- .github/CONTRIBUTING.md | 139 ++++++++++++++++++ kt-kernel/.githooks/pre-commit | 37 +++-- kt-kernel/CMakeLists.txt | 45 ++++-- kt-kernel/cpu_backend/shared_mem_buffer.cpp | 2 +- kt-kernel/operators/moe-tp.hpp | 1 + kt-kernel/operators/moe_kernel/moe.hpp | 2 +- kt-kernel/scripts/convert_cpu_weights.py | 67 +++++---- kt-kernel/scripts/convert_gpu_weights.py | 108 +++++--------- .../convert_kimi_k2_fp8_to_bf16_cpu.py | 15 +- kt-kernel/scripts/convert_moe_to_bf16.py | 12 +- kt-kernel/scripts/install-git-hooks.sh | 25 ++-- 11 files changed, 289 insertions(+), 164 deletions(-) create mode 100644 .github/CONTRIBUTING.md diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 0000000..c16debf --- /dev/null +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,139 @@ +## Before Commit! + +Your commit message must follow Conventional Commits (https://www.conventionalcommits.org/) and your code should be formatted. The Git hooks will do most of the work automatically: + +### Tool Requirements + +You need a recent `clang-format` (>= 18). In a conda environment you can install: + +```shell +conda install -c conda-forge clang-format=18 +``` + +If you previously configured with an older version, remove the build directory and reconfigure: + +```shell +rm -rf kt-kernel/build +``` + +Install `black` for Python formatting: + +```shell +conda install black +``` + +### Install hook: +```shell +bash kt-kernel/scripts/install-git-hooks.sh +#or just cmake the kt-kernel +cmake -S kt-kernel -B kt-kernel/build +``` + +There are manual commands if you need format. 
+ +```shell +cmake -S kt-kernel -B kt-kernel/build +cmake --build kt-kernel/build --target format +``` + +## Developer Note + +Formatting and commit message rules are enforced by Git hooks. After installing `clang-format` and `black`, just commit normally—the hooks will run formatting for you. + +> [!NOTE] +> If formatting modifies files, the commit is aborted after staging those changes. Review them and run `git commit` again. Repeat until no further formatting changes appear. + +--- + +### Conventional Commit Regex (Reference) + +The commit-msg hook enforces this pattern: + +```text +regex='^\[(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip)\](\([^\)]+\))?(!)?: .+' +``` + +Meaning (English): +* `[type]` required — one of feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip +* Optional scope: `(scope)` — any chars except `)` +* Optional breaking change marker: `!` right after type or scope +* Separator: `: ` (colon + space) +* Subject: free text (at least one character) + +Examples: +```text +[feat]: add adaptive batching +[fix](parser): handle empty token list +[docs]!: update API section for breaking rename +``` + +You can bypass locally (not recommended) with: +```shell +git commit --no-verify +``` +## 提交前提醒 + +提交信息必须满足 Conventional Commits 规范 (https://www.conventionalcommits.org/),代码需要符合格式要求。Git 钩子已经集成了大部分工作: +### 软件要求 + +需要较新的 `clang-format` (>= 18),在 conda 环境中安装: + +```shell +conda install -c conda-forge clang-format=18 +``` + +如果之前用老版本配置过,请删除构建目录重新配置: + +```shell +rm -rf kt-kernel/build +``` + +安装 `black` 以进行 Python 文件格式化: + +```shell +conda install black +``` +### 安装钩子 +```shell +bash kt-kernel/scripts/install-git-hooks.sh +#or just cmake the kt-kernel +cmake -S kt-kernel -B kt-kernel/build +``` +如果你需要手动格式化: +```shell +cmake -S kt-kernel -B kt-kernel/build +cmake --build kt-kernel/build --target format +``` + +## 开发者说明 + +本仓库通过 Git hooks 自动执行代码格式化与提交信息规范检查。只需安装好 `clang-format` 与 `black` 后正常执行提交即可,钩子会自动格式化。 + +> [!NOTE] +> 
如果格式化修改了文件,钩子会终止提交并已暂存这些改动。请查看修改后再次执行 `git commit`,重复直到没有新的格式化变更。 + +### 提交信息正则(参考) + +钩子使用如下正则检查提交信息: +```text +regex='^\[(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip)\](\([^\)]+\))?(!)?: .+' +``` +含义: +* `[type]` 必填:feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip +* 作用域可选:`(scope)`,不能包含右括号 +* 可选的破坏性标记:`!` +* 分隔符:冒号+空格 `: ` +* 描述:至少一个字符 + +示例: +```text +[feat]: 增加自适应 batch 功能 +[fix](tokenizer): 修复空 token 列表处理 +[docs]!: 更新接口文档(存在破坏性修改) +``` + +跳过钩子(不推荐,仅紧急时): +```shell +git commit --no-verify +``` + diff --git a/kt-kernel/.githooks/pre-commit b/kt-kernel/.githooks/pre-commit index bdebb12..e3d42e0 100755 --- a/kt-kernel/.githooks/pre-commit +++ b/kt-kernel/.githooks/pre-commit @@ -1,10 +1,12 @@ #!/usr/bin/bash -# Pre-commit hook: run clang-format via CMake 'format' target and Black for Python before allowing commit. -# If formatting makes changes, stage them and abort so user can review. +# Pre-commit hook: run clang-format via kt-kernel's CMake 'format' target and Black for Python +# before allowing commit. If formatting makes changes, stage them and abort so user can review. set -euo pipefail REPO_ROOT="$(git rev-parse --show-toplevel)" -BUILD_DIR="$REPO_ROOT/build" +# kt-kernel project directory within the monorepo +KERNEL_DIR="$REPO_ROOT/kt-kernel" +BUILD_DIR="$KERNEL_DIR/build" FORMAT_TARGET="format" CLANG_FORMAT_BIN="${CLANG_FORMAT_BIN:-clang-format}" BLACK_BIN="${BLACK_BIN:-black}" @@ -20,10 +22,10 @@ if ! command -v "$BLACK_BIN" >/dev/null 2>&1; then echo "[pre-commit] black not found (looked for $BLACK_BIN). Skipping Python format." >&2 fi -# Configure build directory if missing (quiet) -if [ ! -d "$BUILD_DIR" ] || [ ! -f "$BUILD_DIR/Makefile" ] && [ ! -f "$BUILD_DIR/build.ninja" ]; then - echo "[pre-commit] configuring project (cmake) ..." >&2 - cmake -S "$REPO_ROOT" -B "$BUILD_DIR" >/dev/null +# Configure kt-kernel build directory if missing (quiet) +if [ ! -d "$BUILD_DIR" ] || { [ ! -f "$BUILD_DIR/Makefile" ] && [ ! 
-f "$BUILD_DIR/build.ninja" ]; }; then + echo "[pre-commit] configuring kt-kernel (cmake) ..." >&2 + cmake -S "$KERNEL_DIR" -B "$BUILD_DIR" >/dev/null fi # Run format target (prefer ninja if present) @@ -38,15 +40,18 @@ fi # Run black on staged python files (or entire repo if you prefer) if command -v "$BLACK_BIN" >/dev/null 2>&1; then - # Get staged python files; if none, skip - PY_FILES=$(git diff --cached --name-only --diff-filter=ACM | grep -E '\.py$' || true) - if [ -n "$PY_FILES" ]; then - echo "[pre-commit] running black on staged python files..." >&2 - $BLACK_BIN $PY_FILES - else - # Optionally format all python files; comment out if not desired - # $BLACK_BIN "$REPO_ROOT" - : + # Run black only on kt-kernel's python and scripts directories + BLACK_PATHS="" + if [ -d "$KERNEL_DIR/python" ]; then + BLACK_PATHS="$BLACK_PATHS $KERNEL_DIR/python" + fi + if [ -d "$KERNEL_DIR/scripts" ]; then + BLACK_PATHS="$BLACK_PATHS $KERNEL_DIR/scripts" + fi + if [ -n "$BLACK_PATHS" ]; then + echo "[pre-commit] running black on:$BLACK_PATHS" >&2 + # shellcheck disable=SC2086 + $BLACK_BIN $BLACK_PATHS fi fi diff --git a/kt-kernel/CMakeLists.txt b/kt-kernel/CMakeLists.txt index 24f092d..6429b7b 100644 --- a/kt-kernel/CMakeLists.txt +++ b/kt-kernel/CMakeLists.txt @@ -79,27 +79,40 @@ if(USE_CONDA_TOOLCHAIN) set(CMAKE_INSTALL_RPATH_USE_LINK_PATH OFF) endif() -## Ensure git hooks are installed when configuring the project -# If this is a git working copy and the installer exists, run it and fail the CMake configure -# when installation fails. If no .git directory is present (e.g. source tarball), skip. 
-if(EXISTS "${CMAKE_SOURCE_DIR}/.git" AND IS_DIRECTORY "${CMAKE_SOURCE_DIR}/.git") - if(EXISTS "${CMAKE_SOURCE_DIR}/scripts/install-git-hooks.sh") - message(STATUS "Detected .git; installing git hooks using scripts/install-git-hooks.sh") - execute_process( - COMMAND sh "${CMAKE_SOURCE_DIR}/scripts/install-git-hooks.sh" - WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" - RESULT_VARIABLE _INSTALL_GIT_HOOKS_RESULT - OUTPUT_VARIABLE _INSTALL_GIT_HOOKS_OUT - ERROR_VARIABLE _INSTALL_GIT_HOOKS_ERR +## Ensure git hooks are installed when configuring the project (monorepo-aware) +# If we are inside a git worktree (repo root is outside kt-kernel now), invoke the installer +# which will link kt-kernel/.githooks into the top-level .git/hooks. Otherwise, skip. +find_program(GIT_BIN git) +if(GIT_BIN) + execute_process( + COMMAND "${GIT_BIN}" rev-parse --show-toplevel + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE _GIT_TOP + RESULT_VARIABLE _GIT_RV + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + ) + if(_GIT_RV EQUAL 0 AND EXISTS "${_GIT_TOP}/.git" AND IS_DIRECTORY "${_GIT_TOP}/.git") + if(EXISTS "${CMAKE_SOURCE_DIR}/scripts/install-git-hooks.sh") + message(STATUS "Detected git worktree at ${_GIT_TOP}; installing hooks from kt-kernel/.githooks") + execute_process( + COMMAND sh "${CMAKE_SOURCE_DIR}/scripts/install-git-hooks.sh" + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + RESULT_VARIABLE _INSTALL_GIT_HOOKS_RESULT + OUTPUT_VARIABLE _INSTALL_GIT_HOOKS_OUT + ERROR_VARIABLE _INSTALL_GIT_HOOKS_ERR ) - if(NOT _INSTALL_GIT_HOOKS_RESULT EQUAL 0) - message(FATAL_ERROR "Installing git hooks failed (exit ${_INSTALL_GIT_HOOKS_RESULT}).\nOutput:\n${_INSTALL_GIT_HOOKS_OUT}\nError:\n${_INSTALL_GIT_HOOKS_ERR}") + if(NOT _INSTALL_GIT_HOOKS_RESULT EQUAL 0) + message(FATAL_ERROR "Installing git hooks failed (exit ${_INSTALL_GIT_HOOKS_RESULT}).\nOutput:\n${_INSTALL_GIT_HOOKS_OUT}\nError:\n${_INSTALL_GIT_HOOKS_ERR}") + endif() + else() + message(FATAL_ERROR "Required script 
'scripts/install-git-hooks.sh' not found in kt-kernel; cannot install hooks.") endif() else() - message(FATAL_ERROR "Repository appears to be a git repo but required script 'scripts/install-git-hooks.sh' was not found. Please ensure hooks installer is present.") + message(STATUS "No git worktree detected; skipping git hooks installation") endif() else() - message(STATUS "No .git directory found; skipping git hooks installation") + message(STATUS "git not found; skipping git hooks installation") endif() set(CMAKE_CXX_STANDARD 20) diff --git a/kt-kernel/cpu_backend/shared_mem_buffer.cpp b/kt-kernel/cpu_backend/shared_mem_buffer.cpp index c6b04d0..4d74ce1 100644 --- a/kt-kernel/cpu_backend/shared_mem_buffer.cpp +++ b/kt-kernel/cpu_backend/shared_mem_buffer.cpp @@ -9,10 +9,10 @@ **/ #include "shared_mem_buffer.h" +#include #include #include -#include size_t MemoryRequest::total_size() { size_t total = 0; diff --git a/kt-kernel/operators/moe-tp.hpp b/kt-kernel/operators/moe-tp.hpp index 7e400b2..7130715 100644 --- a/kt-kernel/operators/moe-tp.hpp +++ b/kt-kernel/operators/moe-tp.hpp @@ -7,6 +7,7 @@ #include #include +#include "../cpu_backend/shared_mem_buffer.h" #include "common.hpp" // Forward declaration for Llamafile backend type checking diff --git a/kt-kernel/operators/moe_kernel/moe.hpp b/kt-kernel/operators/moe_kernel/moe.hpp index 59e58d0..c5d3acb 100644 --- a/kt-kernel/operators/moe_kernel/moe.hpp +++ b/kt-kernel/operators/moe_kernel/moe.hpp @@ -13,11 +13,11 @@ #include #include "../common.hpp" +#include "../cpu_backend/shared_mem_buffer.h" #include "../moe-tp.hpp" #include "api/common.h" #include "api/mat_kernel.h" #include "llama.cpp/ggml.h" - template class MOE_KERNEL_TP #ifdef FORWARD_TIME_PROFILE diff --git a/kt-kernel/scripts/convert_cpu_weights.py b/kt-kernel/scripts/convert_cpu_weights.py index 520e873..92f3a44 100644 --- a/kt-kernel/scripts/convert_cpu_weights.py +++ b/kt-kernel/scripts/convert_cpu_weights.py @@ -22,7 +22,6 @@ import triton import 
triton.language as tl - Q_BITS = 4 STORAGE_BITS = 32 PACK_NUM = STORAGE_BITS // Q_BITS @@ -31,6 +30,7 @@ NUMA_NUM = 2 REVERSE_AWQ_PACK_ORDER = [0, 4, 1, 5, 2, 6, 3, 7] device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + @triton.jit def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.constexpr): pid_m = tl.program_id(axis=0) @@ -51,10 +51,11 @@ def weight_dequant(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> t assert x.dim() == 2 and s.dim() == 2 M, N = x.size() y = torch.empty_like(x, dtype=torch.get_default_dtype()) - grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE']), triton.cdiv(N, meta['BLOCK_SIZE'])) + grid = lambda meta: (triton.cdiv(M, meta["BLOCK_SIZE"]), triton.cdiv(N, meta["BLOCK_SIZE"])) weight_dequant_kernel[grid](x, s, y, M, N, BLOCK_SIZE=block_size) return y + def load_model_config(input_path: str, input_type: str = None) -> Dict: """Load model configuration from config.json @@ -297,7 +298,6 @@ class ConverterBase: handle = self.file_handle_map[file] return handle.get_tensor(key) - # layers_id -> list[experts_id] def _find_expert_layers(self) -> Dict[int, List[int]]: """Find all layers and experts in the model""" @@ -517,7 +517,9 @@ class OnlineQuantConverter(ConverterBase): quant_method: str = "int4", merge_to_safetensor: bool = True, ): - super().__init__(input_path, output_path, model_config, cpuinfer_threads, threadpool_count, input_type, merge_to_safetensor) + super().__init__( + input_path, output_path, model_config, cpuinfer_threads, threadpool_count, input_type, merge_to_safetensor + ) self.quant_method = quant_method # For FP8, get block size from model_config @@ -569,11 +571,11 @@ class OnlineQuantConverter(ConverterBase): if not os.path.exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: binary_data = f.read() # Determine dtype based on file name - if 'scale' in file_path: + if "scale" in 
file_path: # Scale tensors are typically float32 np_array = np.frombuffer(binary_data, dtype=np.float32) else: @@ -616,22 +618,12 @@ class OnlineQuantConverter(ConverterBase): # Iterate through all experts for expert_id in range(self.num_experts): # For each projection (down, gate, up) - proj_mappings = [ - ('down', 'ffn_down_exps'), - ('gate', 'ffn_gate_exps'), - ('up', 'ffn_up_exps') - ] + proj_mappings = [("down", "ffn_down_exps"), ("gate", "ffn_gate_exps"), ("up", "ffn_up_exps")] for proj_name, proj_key in proj_mappings: # Build file patterns - quant_pattern = os.path.join( - numa_folder, - f'{amx_method}_{proj_name}_{expert_id}_*Byte_quant_.kt' - ) - scale_pattern = os.path.join( - numa_folder, - f'{amx_method}_{proj_name}_{expert_id}_*Byte_scale_.kt' - ) + quant_pattern = os.path.join(numa_folder, f"{amx_method}_{proj_name}_{expert_id}_*Byte_quant_.kt") + scale_pattern = os.path.join(numa_folder, f"{amx_method}_{proj_name}_{expert_id}_*Byte_scale_.kt") # Find files using glob quant_files = glob.glob(quant_pattern) @@ -705,18 +697,18 @@ class OnlineQuantConverter(ConverterBase): raise KeyError(f"Missing down weight_scale_inv for layer {layer_idx}, expert {expert_id}") # Load FP8 weights and scales - gate_fp8 = self._load_tensor(gate_key).to('cuda') - up_fp8 = self._load_tensor(up_key).to('cuda') - down_fp8 = self._load_tensor(down_key).to('cuda') + gate_fp8 = self._load_tensor(gate_key).to("cuda") + up_fp8 = self._load_tensor(up_key).to("cuda") + down_fp8 = self._load_tensor(down_key).to("cuda") - gate_scale_inv = self._load_tensor(gate_scale_key).to('cuda') - up_scale_inv = self._load_tensor(up_scale_key).to('cuda') - down_scale_inv = self._load_tensor(down_scale_key).to('cuda') + gate_scale_inv = self._load_tensor(gate_scale_key).to("cuda") + up_scale_inv = self._load_tensor(up_scale_key).to("cuda") + down_scale_inv = self._load_tensor(down_scale_key).to("cuda") # Dequantize FP8 to BF16 using block-wise scaling - gate_weight = weight_dequant(gate_fp8, 
gate_scale_inv).to('cpu').to(torch.bfloat16).contiguous() - up_weight = weight_dequant(up_fp8, up_scale_inv).to('cpu').to(torch.bfloat16).contiguous() - down_weight = weight_dequant(down_fp8, down_scale_inv).to('cpu').to(torch.bfloat16).contiguous() + gate_weight = weight_dequant(gate_fp8, gate_scale_inv).to("cpu").to(torch.bfloat16).contiguous() + up_weight = weight_dequant(up_fp8, up_scale_inv).to("cpu").to(torch.bfloat16).contiguous() + down_weight = weight_dequant(down_fp8, down_scale_inv).to("cpu").to(torch.bfloat16).contiguous() elif self.input_type == "fp16": # Load FP16 and convert to BF16 @@ -804,6 +796,7 @@ class OnlineQuantConverter(ConverterBase): print(f" Keeping layer folder structure at {self.output_path}/_layer_{layer_idx}") return {} + """ Example usage(test passed): python convert_cpu_weights.py --input-path /mnt/data3/models/DeepSeek-R1-0528/ --input-type fp8 --output /mnt/data3/models/DeepSeek-R1-0528-INT4-test --quant-method int4 --cpuinfer-threads 60 --threadpool-count 2 @@ -811,6 +804,7 @@ python convert_cpu_weights.py --input-path /mnt/data3/models/DeepSeek-R1-0528/ - python convert_cpu_weights.py --input-path /mnt/data2/models/Qwen3-Next-80B-A3B-Instruct --input-type bf16 --output /mnt/data2/models/Qwen3-Next-80B-A3B-Instruct-INT4-test --quant-method int4 --cpuinfer-threads 60 --threadpool-count 2 """ + def main(): parser = argparse.ArgumentParser(description="Convert SafeTensors to column major 1D format") parser.add_argument("--input-path", "-i", required=True, help="Input directory with safetensors") @@ -873,12 +867,25 @@ def main(): if quant_method == "awq": converter = AWQToColumnMajorConverter( - args.input_path, args.output, model_config, args.cpuinfer_threads, args.threadpool_count, input_type=None, merge_to_safetensor=merge_to_safetensor + args.input_path, + args.output, + model_config, + args.cpuinfer_threads, + args.threadpool_count, + input_type=None, + merge_to_safetensor=merge_to_safetensor, ) elif quant_method in ["int4", 
"int8"] and args.input_type in ["fp8", "fp16", "bf16"]: # Use OnlineQuantConverter for both INT4 and INT8 quantization converter = OnlineQuantConverter( - args.input_path, args.output, model_config, args.cpuinfer_threads, args.threadpool_count, args.input_type, quant_method, merge_to_safetensor + args.input_path, + args.output, + model_config, + args.cpuinfer_threads, + args.threadpool_count, + args.input_type, + quant_method, + merge_to_safetensor, ) else: raise ValueError( diff --git a/kt-kernel/scripts/convert_gpu_weights.py b/kt-kernel/scripts/convert_gpu_weights.py index 05f2490..96cde2e 100644 --- a/kt-kernel/scripts/convert_gpu_weights.py +++ b/kt-kernel/scripts/convert_gpu_weights.py @@ -34,63 +34,42 @@ from datasets import load_dataset def parse_args(): parser = argparse.ArgumentParser(description="Quantize MoE models with selective quantization") - + # Required arguments - parser.add_argument( - "--model_id", - type=str, - required=True, - help="Path to the input model directory" - ) - parser.add_argument( - "--output_dir", - type=str, - required=True, - help="Path to save the quantized model" - ) - + parser.add_argument("--model_id", type=str, required=True, help="Path to the input model directory") + parser.add_argument("--output_dir", type=str, required=True, help="Path to save the quantized model") + # Optional arguments parser.add_argument( "--quant_type", type=str, choices=["W4A16", "W8A16"], default="W8A16", - help="Quantization type: W4A16 (GPTQ4) or W8A16 (GPTQ8). Default: W8A16" + help="Quantization type: W4A16 (GPTQ4) or W8A16 (GPTQ8). Default: W8A16", ) parser.add_argument( - "--num_calibration_samples", - type=int, - default=512, - help="Number of calibration samples. Default: 512" + "--num_calibration_samples", type=int, default=512, help="Number of calibration samples. Default: 512" ) parser.add_argument( - "--max_sequence_length", - type=int, - default=2048, - help="Maximum sequence length for calibration. 
Default: 2048" + "--max_sequence_length", type=int, default=2048, help="Maximum sequence length for calibration. Default: 2048" ) parser.add_argument( "--dampening_frac", type=float, default=0.1, - help="Dampening fraction to mitigate quantization noise. Default: 0.1" + help="Dampening fraction to mitigate quantization noise. Default: 0.1", ) parser.add_argument( "--dataset", type=str, default="HuggingFaceH4/ultrachat_200k", - help="Dataset for calibration. Default: HuggingFaceH4/ultrachat_200k" + help="Dataset for calibration. Default: HuggingFaceH4/ultrachat_200k", ) parser.add_argument( - "--dataset_split", - type=str, - default="train_sft", - help="Dataset split to use. Default: train_sft" + "--dataset_split", type=str, default="train_sft", help="Dataset split to use. Default: train_sft" ) parser.add_argument( - "--force_cpu", - action="store_true", - help="Force all computations to CPU (sets CUDA_VISIBLE_DEVICES='')" + "--force_cpu", action="store_true", help="Force all computations to CPU (sets CUDA_VISIBLE_DEVICES='')" ) parser.add_argument( "--ignore_patterns", @@ -103,29 +82,22 @@ def parse_args(): r"re:.*\.shared_expert\..*$", r"re:.*\.shared_experts\..*$", r"re:.*\.mlp\.shared_expert_gate$", - r"re:.*\.linear_attn\..*$" + r"re:.*\.linear_attn\..*$", ], - help="Regex patterns for layers to ignore during quantization" + help="Regex patterns for layers to ignore during quantization", ) parser.add_argument( "--torch_dtype", type=str, choices=["bfloat16", "float16", "float32"], default="bfloat16", - help="PyTorch dtype for model loading. Default: bfloat16" + help="PyTorch dtype for model loading. 
Default: bfloat16", ) parser.add_argument( - "--trust_remote_code", - action="store_true", - help="Allow loading of remote code (required for some models)" + "--trust_remote_code", action="store_true", help="Allow loading of remote code (required for some models)" ) - parser.add_argument( - "--random_seed", - type=int, - default=42, - help="Random seed for dataset shuffling. Default: 42" - ) - + parser.add_argument("--random_seed", type=int, default=42, help="Random seed for dataset shuffling. Default: 42") + return parser.parse_args() @@ -152,11 +124,7 @@ def get_torch_dtype(dtype_str): Returns: torch.dtype: Corresponding PyTorch dtype """ - dtype_map = { - "bfloat16": torch.bfloat16, - "float16": torch.float16, - "float32": torch.float32 - } + dtype_map = {"bfloat16": torch.bfloat16, "float16": torch.float16, "float32": torch.float32} return dtype_map[dtype_str] @@ -176,18 +144,18 @@ def check_dense_layers_and_update_ignore(model_id, ignore_patterns, trust_remote Updated ignore_patterns list with dense layer patterns added """ print("🔍 Checking model configuration for dense layers...") - + try: # Load model configuration config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code) - + # Check if the model has first_k_dense_replace parameter - first_k_dense_replace = getattr(config, 'first_k_dense_replace', None) - + first_k_dense_replace = getattr(config, "first_k_dense_replace", None) + if first_k_dense_replace is not None and first_k_dense_replace > 0: print(f"✅ Found dense layers configuration: first_k_dense_replace = {first_k_dense_replace}") print(f" Adding first {first_k_dense_replace} layers to ignore list...") - + # Create regex pattern for dense layers (layers 0 to first_k_dense_replace-1) if first_k_dense_replace == 1: dense_pattern = r"re:model\.layers\.0\.mlp\..*$" @@ -195,18 +163,18 @@ def check_dense_layers_and_update_ignore(model_id, ignore_patterns, trust_remote # For multiple layers, use range pattern layer_range = 
f"[0-{first_k_dense_replace-1}]" dense_pattern = f"re:model\\.layers\\.{layer_range}\\.mlp\\..*$" - + # Add the dense layer pattern to ignore list updated_ignore_patterns = ignore_patterns + [dense_pattern] - + print(f" Dense layer pattern added: {dense_pattern}") print(f" This will ignore MLP components in layers 0-{first_k_dense_replace-1}") - + return updated_ignore_patterns else: print("ℹ️ No dense layers detected (first_k_dense_replace not found or is 0)") return ignore_patterns - + except Exception as e: print(f"⚠️ Warning: Could not check model config for dense layers: {e}") print(" Proceeding with original ignore patterns...") @@ -246,11 +214,7 @@ def load_and_prepare_dataset(dataset_name, dataset_split, num_samples, max_lengt # Tokenize the data def tokenize(sample): return tokenizer( - sample["text"], - padding=False, - max_length=max_length, - truncation=True, - add_special_tokens=False + sample["text"], padding=False, max_length=max_length, truncation=True, add_special_tokens=False ) ds = ds.map(tokenize, remove_columns=ds.column_names) @@ -291,9 +255,7 @@ def main(): # 0) Check for dense layers and update ignore patterns # Dense layers in the first few layers should not be quantized updated_ignore_patterns = check_dense_layers_and_update_ignore( - args.model_id, - args.ignore_patterns, - args.trust_remote_code + args.model_id, args.ignore_patterns, args.trust_remote_code ) # -------------------------------------------------------------------- @@ -302,13 +264,9 @@ def main(): print("🔍 Inferring device map...") with init_empty_weights(): dummy = AutoModelForCausalLM.from_pretrained( - args.model_id, - torch_dtype=torch_dtype, - trust_remote_code=args.trust_remote_code - ) - device_map = infer_auto_device_map( - dummy, no_split_module_classes=dummy._no_split_modules + args.model_id, torch_dtype=torch_dtype, trust_remote_code=args.trust_remote_code ) + device_map = infer_auto_device_map(dummy, no_split_module_classes=dummy._no_split_modules) del dummy # 
Force all modules to CPU for quantization @@ -335,7 +293,7 @@ def main(): args.num_calibration_samples, args.max_sequence_length, tokenizer, - args.random_seed + args.random_seed, ) # -------------------------------------------------------------------- @@ -373,4 +331,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/kt-kernel/scripts/convert_kimi_k2_fp8_to_bf16_cpu.py b/kt-kernel/scripts/convert_kimi_k2_fp8_to_bf16_cpu.py index 01a2c87..f91f024 100644 --- a/kt-kernel/scripts/convert_kimi_k2_fp8_to_bf16_cpu.py +++ b/kt-kernel/scripts/convert_kimi_k2_fp8_to_bf16_cpu.py @@ -9,6 +9,7 @@ from safetensors.torch import load_file, save_file import gc + def weight_dequant_cpu(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> torch.Tensor: assert x.dim() == 2 and s.dim() == 2, "Expect 2D tensors for x and s" M, N = x.shape @@ -27,6 +28,7 @@ def weight_dequant_cpu(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) y[m0:m1, n0:n1] = sub.to(torch.bfloat16) return y + def main(fp8_path, bf16_path): torch.set_default_dtype(torch.bfloat16) os.makedirs(bf16_path, exist_ok=True) @@ -34,7 +36,7 @@ def main(fp8_path, bf16_path): with open(model_index_file, "r") as f: model_index = json.load(f) weight_map = model_index["weight_map"] - + loaded_files = {} fp8_weight_names = [] @@ -51,7 +53,7 @@ def main(fp8_path, bf16_path): file_name = os.path.basename(safetensor_file) current_state_dict = load_file(safetensor_file, device="cpu") loaded_files[file_name] = current_state_dict - + new_state_dict = {} for weight_name, weight in current_state_dict.items(): if weight_name.endswith("_scale_inv"): @@ -67,17 +69,17 @@ def main(fp8_path, bf16_path): new_state_dict[weight_name] = weight else: new_state_dict[weight_name] = weight - + new_safetensor_file = os.path.join(bf16_path, file_name) save_file(new_state_dict, new_safetensor_file) - + if len(loaded_files) > 2: oldest_file = next(iter(loaded_files)) del 
loaded_files[oldest_file] gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() - + new_model_index_file = os.path.join(bf16_path, "model.safetensors.index.json") for weight_name in fp8_weight_names: scale_inv_name = f"{weight_name}_scale_inv" @@ -87,9 +89,10 @@ def main(fp8_path, bf16_path): json.dump({"metadata": {}, "weight_map": weight_map}, f, indent=2) print(f"Finish, Result in: {bf16_path}") + if __name__ == "__main__": parser = ArgumentParser() parser.add_argument("--input-fp8-hf-path", type=str, required=True, help="Kimi-K2 FP8 model") parser.add_argument("--output-bf16-hf-path", type=str, required=True, help="BF16 model (After convert)") args = parser.parse_args() - main(args.input_fp8_hf_path, args.output_bf16_hf_path) \ No newline at end of file + main(args.input_fp8_hf_path, args.output_bf16_hf_path) diff --git a/kt-kernel/scripts/convert_moe_to_bf16.py b/kt-kernel/scripts/convert_moe_to_bf16.py index d618472..87e3473 100644 --- a/kt-kernel/scripts/convert_moe_to_bf16.py +++ b/kt-kernel/scripts/convert_moe_to_bf16.py @@ -48,9 +48,7 @@ def _dequantize_tensor( if scales.numel() == weight.numel(): scales = scales.reshape_as(weight) else: - raise ValueError( - f"Scale shape {scales.shape} incompatible with weight shape {weight.shape}" - ) + raise ValueError(f"Scale shape {scales.shape} incompatible with weight shape {weight.shape}") bf16 = (weight.to(torch.float32) * scales).to(torch.bfloat16) return bf16.contiguous() @@ -128,9 +126,7 @@ def convert_file( os.makedirs(os.path.dirname(output_path), exist_ok=True) save_file(tensors, output_path) - print( - f"[done] wrote {output_path} (converted={stats['converted']}, skipped={stats['skipped']})" - ) + print(f"[done] wrote {output_path} (converted={stats['converted']}, skipped={stats['skipped']})") def parse_args() -> argparse.Namespace: @@ -174,9 +170,7 @@ def main(): targets = [os.path.join(model_dir, fname) for fname in args.files] else: targets = [ - os.path.join(model_dir, name) - for name 
in sorted(os.listdir(model_dir)) - if name.endswith(".safetensors") + os.path.join(model_dir, name) for name in sorted(os.listdir(model_dir)) if name.endswith(".safetensors") ] if not targets: diff --git a/kt-kernel/scripts/install-git-hooks.sh b/kt-kernel/scripts/install-git-hooks.sh index be3b03e..b1d0beb 100755 --- a/kt-kernel/scripts/install-git-hooks.sh +++ b/kt-kernel/scripts/install-git-hooks.sh @@ -1,24 +1,29 @@ #!/usr/bin/env sh -# Install git hooks from .githooks into .git/hooks by creating symlinks (or copying if symlink fails). +# Install git hooks from kt-kernel/.githooks into the monorepo's .git/hooks by +# creating symlinks (or copying if symlink fails). set -eu +# This script lives in kt-kernel/scripts/, so REPO_ROOT = kt-kernel REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" -GIT_DIR="$REPO_ROOT/.git" HOOKS_SRC="$REPO_ROOT/.githooks" + +# Detect the top-level Git worktree (the monorepo root: ktransformers) +GIT_TOP="$(git rev-parse --show-toplevel 2>/dev/null || true)" +if [ -z "$GIT_TOP" ] || [ ! -d "$GIT_TOP/.git" ]; then + echo "[install-git-hooks] Not inside a git worktree; skipping hooks installation." >&2 + exit 0 +fi + +GIT_DIR="$GIT_TOP/.git" HOOKS_DEST="$GIT_DIR/hooks" -if [ ! -d "$GIT_DIR" ]; then - echo "Not a git repository (no .git directory) at $REPO_ROOT" >&2 - exit 1 -fi - if [ ! -d "$HOOKS_SRC" ]; then - echo "No .githooks directory found at $HOOKS_SRC" >&2 + echo "[install-git-hooks] No .githooks directory found at $HOOKS_SRC" >&2 exit 1 fi -echo "Installing git hooks from $HOOKS_SRC to $HOOKS_DEST" +echo "[install-git-hooks] Installing git hooks from $HOOKS_SRC to $HOOKS_DEST (repo: $GIT_TOP)" # Ensure all source hook files are executable so that even if copied (not symlinked) they run. for src_hook in "$HOOKS_SRC"/*; do @@ -49,4 +54,4 @@ for hook in "$HOOKS_SRC"/*; do fi done -echo "Done. Hooks installed." +echo "[install-git-hooks] Done. Hooks installed."