From aef6672dd84024aef8a1355306e3b6a7a0bf03fc Mon Sep 17 00:00:00 2001
From: ZiWei Yuan <yzwliam@126.com>
Date: Sat, 15 Nov 2025 18:26:49 +0800
Subject: [PATCH] [docs]: add contributing guide and add hooks install (#1613)

* [feat]: update kt-kernel hooks and add contribution guide

* [docs]: add contributing guide
* [style]: format the python file and cpp file in kt-kernel
---
 .github/CONTRIBUTING.md                       | 139 ++++++++++++++++++
 kt-kernel/.githooks/pre-commit                |  37 +++--
 kt-kernel/CMakeLists.txt                      |  45 ++++--
 kt-kernel/cpu_backend/shared_mem_buffer.cpp   |   2 +-
 kt-kernel/operators/moe-tp.hpp                |   1 +
 kt-kernel/operators/moe_kernel/moe.hpp        |   2 +-
 kt-kernel/scripts/convert_cpu_weights.py      |  67 +++++----
 kt-kernel/scripts/convert_gpu_weights.py      | 108 +++++---------
 .../convert_kimi_k2_fp8_to_bf16_cpu.py        |  15 +-
 kt-kernel/scripts/convert_moe_to_bf16.py      |  12 +-
 kt-kernel/scripts/install-git-hooks.sh        |  25 ++--
 11 files changed, 289 insertions(+), 164 deletions(-)
 create mode 100644 .github/CONTRIBUTING.md

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
new file mode 100644
index 0000000..c16debf
--- /dev/null
+++ b/.github/CONTRIBUTING.md
@@ -0,0 +1,139 @@
+## Before Commit!
+
+Your commit message must follow Conventional Commits (https://www.conventionalcommits.org/) and your code should be formatted. The Git hooks will do most of the work automatically:
+
+### Tool Requirements
+
+You need a recent `clang-format` (>= 18). In a conda environment you can install:
+
+```shell
+conda install -c conda-forge clang-format=18
+```
+
+If you previously configured with an older version, remove the build directory and reconfigure:
+
+```shell
+rm -rf kt-kernel/build
+```
+
+Install `black` for Python formatting:
+
+```shell
+conda install black
+```
+
+### Install hook:
+```shell
+bash kt-kernel/scripts/install-git-hooks.sh
+# or simply configure kt-kernel with CMake, which also installs the hooks
+cmake -S kt-kernel -B kt-kernel/build
+```
+
+To format the code manually, run:
+
+```shell
+cmake -S kt-kernel -B kt-kernel/build
+cmake --build kt-kernel/build --target format
+```
+
+## Developer Note
+
+Formatting and commit message rules are enforced by Git hooks. After installing `clang-format` and `black`, just commit normally—the hooks will run formatting for you.
+
+> [!NOTE]
+> If formatting modifies files, the commit is aborted after staging those changes. Review them and run `git commit` again. Repeat until no further formatting changes appear.
+
+---
+
+### Conventional Commit Regex (Reference)
+
+The commit-msg hook enforces this pattern:
+
+```text
+regex='^\[(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip)\](\([^\)]+\))?(!)?: .+'
+```
+
+Meaning (English):
+* `[type]` required — one of feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip
+* Optional scope: `(scope)` placed right after `[type]` — any characters except `)`
+* Optional breaking change marker: `!` right after type or scope
+* Separator: `: ` (colon + space)
+* Subject: free text (at least one character)
+
+Examples:
+```text
+[feat]: add adaptive batching
+[fix](parser): handle empty token list
+[docs]!: update API section for breaking rename
+```
+
+You can bypass locally (not recommended) with:
+```shell
+git commit --no-verify
+```
+## 提交前提醒
+
+提交信息必须满足 Conventional Commits 规范 (https://www.conventionalcommits.org/)，代码需要符合格式要求。Git 钩子已经集成了大部分工作：
+### 软件要求
+
+需要较新的 `clang-format` (>= 18)，在 conda 环境中安装：
+
+```shell
+conda install -c conda-forge clang-format=18
+```
+
+如果之前用老版本配置过，请删除构建目录重新配置：
+
+```shell
+rm -rf kt-kernel/build
+```
+
+安装 `black` 以进行 Python 文件格式化：
+
+```shell
+conda install black
+```
+### 安装钩子
+```shell
+bash kt-kernel/scripts/install-git-hooks.sh
+# or simply configure kt-kernel with CMake, which also installs the hooks
+cmake -S kt-kernel -B kt-kernel/build
+```
+如果你需要手动格式化：
+```shell
+cmake -S kt-kernel -B kt-kernel/build
+cmake --build kt-kernel/build --target format
+```
+
+## 开发者说明
+
+本仓库通过 Git hooks 自动执行代码格式化与提交信息规范检查。只需安装好 `clang-format` 与 `black` 后正常执行提交即可，钩子会自动格式化。
+
+> [!NOTE]
+> 如果格式化修改了文件，钩子会终止提交并已暂存这些改动。请查看修改后再次执行 `git commit`，重复直到没有新的格式化变更。
+
+### 提交信息正则（参考）
+
+钩子使用如下正则检查提交信息：
+```text
+regex='^\[(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip)\](\([^\)]+\))?(!)?: .+'
+```
+含义：
+* `[type]` 必填：feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip
+* 作用域可选：`(scope)`，紧跟在 `[type]` 之后，不能包含右括号
+* 可选的破坏性标记：`!`
+* 分隔符：冒号+空格 `: `
+* 描述：至少一个字符
+
+示例：
+```text
+[feat]: 增加自适应 batch 功能
+[fix](tokenizer): 修复空 token 列表处理
+[docs]!: 更新接口文档（存在破坏性修改）
+```
+
+跳过钩子（不推荐，仅紧急时）：
+```shell
+git commit --no-verify
+```
+
diff --git a/kt-kernel/.githooks/pre-commit b/kt-kernel/.githooks/pre-commit
index bdebb12..e3d42e0 100755
--- a/kt-kernel/.githooks/pre-commit
+++ b/kt-kernel/.githooks/pre-commit
@@ -1,10 +1,12 @@
 #!/usr/bin/bash
-# Pre-commit hook: run clang-format via CMake 'format' target and Black for Python before allowing commit.
-# If formatting makes changes, stage them and abort so user can review.
+# Pre-commit hook: run clang-format via kt-kernel's CMake 'format' target and Black for Python
+# before allowing commit. If formatting makes changes, stage them and abort so user can review.
 set -euo pipefail
 
 REPO_ROOT="$(git rev-parse --show-toplevel)"
-BUILD_DIR="$REPO_ROOT/build"
+# kt-kernel project directory within the monorepo
+KERNEL_DIR="$REPO_ROOT/kt-kernel"
+BUILD_DIR="$KERNEL_DIR/build"
 FORMAT_TARGET="format"
 CLANG_FORMAT_BIN="${CLANG_FORMAT_BIN:-clang-format}"
 BLACK_BIN="${BLACK_BIN:-black}"
@@ -20,10 +22,10 @@ if ! command -v "$BLACK_BIN" >/dev/null 2>&1; then
   echo "[pre-commit] black not found (looked for $BLACK_BIN). Skipping Python format." >&2
 fi
 
-# Configure build directory if missing (quiet)
-if [ ! -d "$BUILD_DIR" ] || [ ! -f "$BUILD_DIR/Makefile" ] && [ ! -f "$BUILD_DIR/build.ninja" ]; then
-  echo "[pre-commit] configuring project (cmake) ..." >&2
-  cmake -S "$REPO_ROOT" -B "$BUILD_DIR" >/dev/null
+# Configure kt-kernel build directory if missing (quiet)
+if [ ! -d "$BUILD_DIR" ] || { [ ! -f "$BUILD_DIR/Makefile" ] && [ ! -f "$BUILD_DIR/build.ninja" ]; }; then
+  echo "[pre-commit] configuring kt-kernel (cmake) ..." >&2
+  cmake -S "$KERNEL_DIR" -B "$BUILD_DIR" >/dev/null
 fi
 
 # Run format target (prefer ninja if present)
@@ -38,15 +40,18 @@ fi
 
 # Run black on staged python files (or entire repo if you prefer)
 if command -v "$BLACK_BIN" >/dev/null 2>&1; then
-  # Get staged python files; if none, skip
-  PY_FILES=$(git diff --cached --name-only --diff-filter=ACM | grep -E '\.py$' || true)
-  if [ -n "$PY_FILES" ]; then
-    echo "[pre-commit] running black on staged python files..." >&2
-    $BLACK_BIN $PY_FILES
-  else
-    # Optionally format all python files; comment out if not desired
-    # $BLACK_BIN "$REPO_ROOT"
-    :
+  # Run black only on kt-kernel's python and scripts directories
+  BLACK_PATHS=""
+  if [ -d "$KERNEL_DIR/python" ]; then
+    BLACK_PATHS="$BLACK_PATHS $KERNEL_DIR/python"
+  fi
+  if [ -d "$KERNEL_DIR/scripts" ]; then
+    BLACK_PATHS="$BLACK_PATHS $KERNEL_DIR/scripts"
+  fi
+  if [ -n "$BLACK_PATHS" ]; then
+    echo "[pre-commit] running black on:$BLACK_PATHS" >&2
+    # shellcheck disable=SC2086
+    $BLACK_BIN $BLACK_PATHS
   fi
 fi
 
diff --git a/kt-kernel/CMakeLists.txt b/kt-kernel/CMakeLists.txt
index 24f092d..6429b7b 100644
--- a/kt-kernel/CMakeLists.txt
+++ b/kt-kernel/CMakeLists.txt
@@ -79,27 +79,40 @@ if(USE_CONDA_TOOLCHAIN)
     set(CMAKE_INSTALL_RPATH_USE_LINK_PATH OFF)
 endif()
 
-## Ensure git hooks are installed when configuring the project
-# If this is a git working copy and the installer exists, run it and fail the CMake configure
-# when installation fails. If no .git directory is present (e.g. source tarball), skip.
-if(EXISTS "${CMAKE_SOURCE_DIR}/.git" AND IS_DIRECTORY "${CMAKE_SOURCE_DIR}/.git")
-    if(EXISTS "${CMAKE_SOURCE_DIR}/scripts/install-git-hooks.sh")
-        message(STATUS "Detected .git; installing git hooks using scripts/install-git-hooks.sh")
-        execute_process(
-            COMMAND sh "${CMAKE_SOURCE_DIR}/scripts/install-git-hooks.sh"
-            WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-            RESULT_VARIABLE _INSTALL_GIT_HOOKS_RESULT
-            OUTPUT_VARIABLE _INSTALL_GIT_HOOKS_OUT
-            ERROR_VARIABLE _INSTALL_GIT_HOOKS_ERR
+## Ensure git hooks are installed when configuring the project (monorepo-aware)
+# If we are inside a git worktree (repo root is outside kt-kernel now), invoke the installer
+# which will link kt-kernel/.githooks into the top-level .git/hooks. Otherwise, skip.
+find_program(GIT_BIN git)
+if(GIT_BIN)
+    execute_process(
+        COMMAND "${GIT_BIN}" rev-parse --show-toplevel
+        WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+        OUTPUT_VARIABLE _GIT_TOP
+        RESULT_VARIABLE _GIT_RV
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+    if(_GIT_RV EQUAL 0 AND EXISTS "${_GIT_TOP}/.git" AND IS_DIRECTORY "${_GIT_TOP}/.git")
+        if(EXISTS "${CMAKE_SOURCE_DIR}/scripts/install-git-hooks.sh")
+            message(STATUS "Detected git worktree at ${_GIT_TOP}; installing hooks from kt-kernel/.githooks")
+            execute_process(
+                COMMAND sh "${CMAKE_SOURCE_DIR}/scripts/install-git-hooks.sh"
+                WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+                RESULT_VARIABLE _INSTALL_GIT_HOOKS_RESULT
+                OUTPUT_VARIABLE _INSTALL_GIT_HOOKS_OUT
+                ERROR_VARIABLE _INSTALL_GIT_HOOKS_ERR
             )
-        if(NOT _INSTALL_GIT_HOOKS_RESULT EQUAL 0)
-            message(FATAL_ERROR "Installing git hooks failed (exit ${_INSTALL_GIT_HOOKS_RESULT}).\nOutput:\n${_INSTALL_GIT_HOOKS_OUT}\nError:\n${_INSTALL_GIT_HOOKS_ERR}")
+            if(NOT _INSTALL_GIT_HOOKS_RESULT EQUAL 0)
+                message(FATAL_ERROR "Installing git hooks failed (exit ${_INSTALL_GIT_HOOKS_RESULT}).\nOutput:\n${_INSTALL_GIT_HOOKS_OUT}\nError:\n${_INSTALL_GIT_HOOKS_ERR}")
+            endif()
+        else()
+            message(FATAL_ERROR "Required script 'scripts/install-git-hooks.sh' not found in kt-kernel; cannot install hooks.")
         endif()
     else()
-        message(FATAL_ERROR "Repository appears to be a git repo but required script 'scripts/install-git-hooks.sh' was not found. Please ensure hooks installer is present.")
+        message(STATUS "No git worktree detected; skipping git hooks installation")
     endif()
 else()
-    message(STATUS "No .git directory found; skipping git hooks installation")
+    message(STATUS "git not found; skipping git hooks installation")
 endif()
 
 set(CMAKE_CXX_STANDARD 20)
diff --git a/kt-kernel/cpu_backend/shared_mem_buffer.cpp b/kt-kernel/cpu_backend/shared_mem_buffer.cpp
index c6b04d0..4d74ce1 100644
--- a/kt-kernel/cpu_backend/shared_mem_buffer.cpp
+++ b/kt-kernel/cpu_backend/shared_mem_buffer.cpp
@@ -9,10 +9,10 @@
  **/
 #include "shared_mem_buffer.h"
 
+#include <errno.h>
 #include <numa.h>
 
 #include <cstdio>
-#include <errno.h>
 
 size_t MemoryRequest::total_size() {
   size_t total = 0;
diff --git a/kt-kernel/operators/moe-tp.hpp b/kt-kernel/operators/moe-tp.hpp
index 7e400b2..7130715 100644
--- a/kt-kernel/operators/moe-tp.hpp
+++ b/kt-kernel/operators/moe-tp.hpp
@@ -7,6 +7,7 @@
 #include <cstdio>
 #include <type_traits>
 
+#include "../cpu_backend/shared_mem_buffer.h"
 #include "common.hpp"
 
 // Forward declaration for Llamafile backend type checking
diff --git a/kt-kernel/operators/moe_kernel/moe.hpp b/kt-kernel/operators/moe_kernel/moe.hpp
index 59e58d0..c5d3acb 100644
--- a/kt-kernel/operators/moe_kernel/moe.hpp
+++ b/kt-kernel/operators/moe_kernel/moe.hpp
@@ -13,11 +13,11 @@
 #include <vector>
 
 #include "../common.hpp"
+#include "../cpu_backend/shared_mem_buffer.h"
 #include "../moe-tp.hpp"
 #include "api/common.h"
 #include "api/mat_kernel.h"
 #include "llama.cpp/ggml.h"
-
 template <class T, bool PLAIN = true>
 class MOE_KERNEL_TP
 #ifdef FORWARD_TIME_PROFILE
diff --git a/kt-kernel/scripts/convert_cpu_weights.py b/kt-kernel/scripts/convert_cpu_weights.py
index 520e873..92f3a44 100644
--- a/kt-kernel/scripts/convert_cpu_weights.py
+++ b/kt-kernel/scripts/convert_cpu_weights.py
@@ -22,7 +22,6 @@ import triton
 import triton.language as tl
 
 
-
 Q_BITS = 4
 STORAGE_BITS = 32
 PACK_NUM = STORAGE_BITS // Q_BITS
@@ -31,6 +30,7 @@ NUMA_NUM = 2
 REVERSE_AWQ_PACK_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+
 @triton.jit
 def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.constexpr):
     pid_m = tl.program_id(axis=0)
@@ -51,10 +51,11 @@ def weight_dequant(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> t
     assert x.dim() == 2 and s.dim() == 2
     M, N = x.size()
     y = torch.empty_like(x, dtype=torch.get_default_dtype())
-    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE']), triton.cdiv(N, meta['BLOCK_SIZE']))
+    grid = lambda meta: (triton.cdiv(M, meta["BLOCK_SIZE"]), triton.cdiv(N, meta["BLOCK_SIZE"]))
     weight_dequant_kernel[grid](x, s, y, M, N, BLOCK_SIZE=block_size)
     return y
 
+
 def load_model_config(input_path: str, input_type: str = None) -> Dict:
     """Load model configuration from config.json
 
@@ -297,7 +298,6 @@ class ConverterBase:
         handle = self.file_handle_map[file]
         return handle.get_tensor(key)
 
-
     # layers_id -> list[experts_id]
     def _find_expert_layers(self) -> Dict[int, List[int]]:
         """Find all layers and experts in the model"""
@@ -517,7 +517,9 @@ class OnlineQuantConverter(ConverterBase):
         quant_method: str = "int4",
         merge_to_safetensor: bool = True,
     ):
-        super().__init__(input_path, output_path, model_config, cpuinfer_threads, threadpool_count, input_type, merge_to_safetensor)
+        super().__init__(
+            input_path, output_path, model_config, cpuinfer_threads, threadpool_count, input_type, merge_to_safetensor
+        )
         self.quant_method = quant_method
 
         # For FP8, get block size from model_config
@@ -569,11 +571,11 @@ class OnlineQuantConverter(ConverterBase):
         if not os.path.exists(file_path):
             raise FileNotFoundError(f"File not found: {file_path}")
 
-        with open(file_path, 'rb') as f:
+        with open(file_path, "rb") as f:
             binary_data = f.read()
 
         # Determine dtype based on file name
-        if 'scale' in file_path:
+        if "scale" in file_path:
             # Scale tensors are typically float32
             np_array = np.frombuffer(binary_data, dtype=np.float32)
         else:
@@ -616,22 +618,12 @@ class OnlineQuantConverter(ConverterBase):
             # Iterate through all experts
             for expert_id in range(self.num_experts):
                 # For each projection (down, gate, up)
-                proj_mappings = [
-                    ('down', 'ffn_down_exps'),
-                    ('gate', 'ffn_gate_exps'),
-                    ('up', 'ffn_up_exps')
-                ]
+                proj_mappings = [("down", "ffn_down_exps"), ("gate", "ffn_gate_exps"), ("up", "ffn_up_exps")]
 
                 for proj_name, proj_key in proj_mappings:
                     # Build file patterns
-                    quant_pattern = os.path.join(
-                        numa_folder,
-                        f'{amx_method}_{proj_name}_{expert_id}_*Byte_quant_.kt'
-                    )
-                    scale_pattern = os.path.join(
-                        numa_folder,
-                        f'{amx_method}_{proj_name}_{expert_id}_*Byte_scale_.kt'
-                    )
+                    quant_pattern = os.path.join(numa_folder, f"{amx_method}_{proj_name}_{expert_id}_*Byte_quant_.kt")
+                    scale_pattern = os.path.join(numa_folder, f"{amx_method}_{proj_name}_{expert_id}_*Byte_scale_.kt")
 
                     # Find files using glob
                     quant_files = glob.glob(quant_pattern)
@@ -705,18 +697,18 @@ class OnlineQuantConverter(ConverterBase):
                     raise KeyError(f"Missing down weight_scale_inv for layer {layer_idx}, expert {expert_id}")
 
                 # Load FP8 weights and scales
-                gate_fp8 = self._load_tensor(gate_key).to('cuda')
-                up_fp8 = self._load_tensor(up_key).to('cuda')
-                down_fp8 = self._load_tensor(down_key).to('cuda')
+                gate_fp8 = self._load_tensor(gate_key).to("cuda")
+                up_fp8 = self._load_tensor(up_key).to("cuda")
+                down_fp8 = self._load_tensor(down_key).to("cuda")
 
-                gate_scale_inv = self._load_tensor(gate_scale_key).to('cuda')
-                up_scale_inv = self._load_tensor(up_scale_key).to('cuda')
-                down_scale_inv = self._load_tensor(down_scale_key).to('cuda')
+                gate_scale_inv = self._load_tensor(gate_scale_key).to("cuda")
+                up_scale_inv = self._load_tensor(up_scale_key).to("cuda")
+                down_scale_inv = self._load_tensor(down_scale_key).to("cuda")
 
                 # Dequantize FP8 to BF16 using block-wise scaling
-                gate_weight = weight_dequant(gate_fp8, gate_scale_inv).to('cpu').to(torch.bfloat16).contiguous()
-                up_weight = weight_dequant(up_fp8, up_scale_inv).to('cpu').to(torch.bfloat16).contiguous()
-                down_weight = weight_dequant(down_fp8, down_scale_inv).to('cpu').to(torch.bfloat16).contiguous()
+                gate_weight = weight_dequant(gate_fp8, gate_scale_inv).to("cpu").to(torch.bfloat16).contiguous()
+                up_weight = weight_dequant(up_fp8, up_scale_inv).to("cpu").to(torch.bfloat16).contiguous()
+                down_weight = weight_dequant(down_fp8, down_scale_inv).to("cpu").to(torch.bfloat16).contiguous()
 
             elif self.input_type == "fp16":
                 # Load FP16 and convert to BF16
@@ -804,6 +796,7 @@ class OnlineQuantConverter(ConverterBase):
             print(f"  Keeping layer folder structure at {self.output_path}/_layer_{layer_idx}")
             return {}
 
+
 """
 Example usage(test passed):
 python convert_cpu_weights.py --input-path /mnt/data3/models/DeepSeek-R1-0528/ --input-type fp8 --output /mnt/data3/models/DeepSeek-R1-0528-INT4-test --quant-method int4 --cpuinfer-threads 60 --threadpool-count 2
@@ -811,6 +804,7 @@ python convert_cpu_weights.py --input-path /mnt/data3/models/DeepSeek-R1-0528/ -
 python convert_cpu_weights.py --input-path /mnt/data2/models/Qwen3-Next-80B-A3B-Instruct --input-type bf16 --output /mnt/data2/models/Qwen3-Next-80B-A3B-Instruct-INT4-test --quant-method int4 --cpuinfer-threads 60 --threadpool-count 2
 """
 
+
 def main():
     parser = argparse.ArgumentParser(description="Convert SafeTensors to column major 1D format")
     parser.add_argument("--input-path", "-i", required=True, help="Input directory with safetensors")
@@ -873,12 +867,25 @@ def main():
 
         if quant_method == "awq":
             converter = AWQToColumnMajorConverter(
-                args.input_path, args.output, model_config, args.cpuinfer_threads, args.threadpool_count, input_type=None, merge_to_safetensor=merge_to_safetensor
+                args.input_path,
+                args.output,
+                model_config,
+                args.cpuinfer_threads,
+                args.threadpool_count,
+                input_type=None,
+                merge_to_safetensor=merge_to_safetensor,
             )
         elif quant_method in ["int4", "int8"] and args.input_type in ["fp8", "fp16", "bf16"]:
             # Use OnlineQuantConverter for both INT4 and INT8 quantization
             converter = OnlineQuantConverter(
-                args.input_path, args.output, model_config, args.cpuinfer_threads, args.threadpool_count, args.input_type, quant_method, merge_to_safetensor
+                args.input_path,
+                args.output,
+                model_config,
+                args.cpuinfer_threads,
+                args.threadpool_count,
+                args.input_type,
+                quant_method,
+                merge_to_safetensor,
             )
         else:
             raise ValueError(
diff --git a/kt-kernel/scripts/convert_gpu_weights.py b/kt-kernel/scripts/convert_gpu_weights.py
index 05f2490..96cde2e 100644
--- a/kt-kernel/scripts/convert_gpu_weights.py
+++ b/kt-kernel/scripts/convert_gpu_weights.py
@@ -34,63 +34,42 @@ from datasets import load_dataset
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Quantize MoE models with selective quantization")
-    
+
     # Required arguments
-    parser.add_argument(
-        "--model_id",
-        type=str,
-        required=True,
-        help="Path to the input model directory"
-    )
-    parser.add_argument(
-        "--output_dir", 
-        type=str,
-        required=True,
-        help="Path to save the quantized model"
-    )
-    
+    parser.add_argument("--model_id", type=str, required=True, help="Path to the input model directory")
+    parser.add_argument("--output_dir", type=str, required=True, help="Path to save the quantized model")
+
     # Optional arguments
     parser.add_argument(
         "--quant_type",
         type=str,
         choices=["W4A16", "W8A16"],
         default="W8A16",
-        help="Quantization type: W4A16 (GPTQ4) or W8A16 (GPTQ8). Default: W8A16"
+        help="Quantization type: W4A16 (GPTQ4) or W8A16 (GPTQ8). Default: W8A16",
     )
     parser.add_argument(
-        "--num_calibration_samples",
-        type=int,
-        default=512,
-        help="Number of calibration samples. Default: 512"
+        "--num_calibration_samples", type=int, default=512, help="Number of calibration samples. Default: 512"
     )
     parser.add_argument(
-        "--max_sequence_length",
-        type=int,
-        default=2048,
-        help="Maximum sequence length for calibration. Default: 2048"
+        "--max_sequence_length", type=int, default=2048, help="Maximum sequence length for calibration. Default: 2048"
     )
     parser.add_argument(
         "--dampening_frac",
         type=float,
         default=0.1,
-        help="Dampening fraction to mitigate quantization noise. Default: 0.1"
+        help="Dampening fraction to mitigate quantization noise. Default: 0.1",
     )
     parser.add_argument(
         "--dataset",
         type=str,
         default="HuggingFaceH4/ultrachat_200k",
-        help="Dataset for calibration. Default: HuggingFaceH4/ultrachat_200k"
+        help="Dataset for calibration. Default: HuggingFaceH4/ultrachat_200k",
     )
     parser.add_argument(
-        "--dataset_split",
-        type=str,
-        default="train_sft",
-        help="Dataset split to use. Default: train_sft"
+        "--dataset_split", type=str, default="train_sft", help="Dataset split to use. Default: train_sft"
     )
     parser.add_argument(
-        "--force_cpu",
-        action="store_true",
-        help="Force all computations to CPU (sets CUDA_VISIBLE_DEVICES='')"
+        "--force_cpu", action="store_true", help="Force all computations to CPU (sets CUDA_VISIBLE_DEVICES='')"
     )
     parser.add_argument(
         "--ignore_patterns",
@@ -103,29 +82,22 @@ def parse_args():
             r"re:.*\.shared_expert\..*$",
             r"re:.*\.shared_experts\..*$",
             r"re:.*\.mlp\.shared_expert_gate$",
-            r"re:.*\.linear_attn\..*$"
+            r"re:.*\.linear_attn\..*$",
         ],
-        help="Regex patterns for layers to ignore during quantization"
+        help="Regex patterns for layers to ignore during quantization",
     )
     parser.add_argument(
         "--torch_dtype",
         type=str,
         choices=["bfloat16", "float16", "float32"],
         default="bfloat16",
-        help="PyTorch dtype for model loading. Default: bfloat16"
+        help="PyTorch dtype for model loading. Default: bfloat16",
     )
     parser.add_argument(
-        "--trust_remote_code",
-        action="store_true",
-        help="Allow loading of remote code (required for some models)"
+        "--trust_remote_code", action="store_true", help="Allow loading of remote code (required for some models)"
     )
-    parser.add_argument(
-        "--random_seed",
-        type=int,
-        default=42,
-        help="Random seed for dataset shuffling. Default: 42"
-    )
-    
+    parser.add_argument("--random_seed", type=int, default=42, help="Random seed for dataset shuffling. Default: 42")
+
     return parser.parse_args()
 
 
@@ -152,11 +124,7 @@ def get_torch_dtype(dtype_str):
     Returns:
         torch.dtype: Corresponding PyTorch dtype
     """
-    dtype_map = {
-        "bfloat16": torch.bfloat16,
-        "float16": torch.float16,
-        "float32": torch.float32
-    }
+    dtype_map = {"bfloat16": torch.bfloat16, "float16": torch.float16, "float32": torch.float32}
     return dtype_map[dtype_str]
 
 
@@ -176,18 +144,18 @@ def check_dense_layers_and_update_ignore(model_id, ignore_patterns, trust_remote
         Updated ignore_patterns list with dense layer patterns added
     """
     print("🔍 Checking model configuration for dense layers...")
-    
+
     try:
         # Load model configuration
         config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code)
-        
+
         # Check if the model has first_k_dense_replace parameter
-        first_k_dense_replace = getattr(config, 'first_k_dense_replace', None)
-        
+        first_k_dense_replace = getattr(config, "first_k_dense_replace", None)
+
         if first_k_dense_replace is not None and first_k_dense_replace > 0:
             print(f"✅ Found dense layers configuration: first_k_dense_replace = {first_k_dense_replace}")
             print(f"   Adding first {first_k_dense_replace} layers to ignore list...")
-            
+
             # Create regex pattern for dense layers (layers 0 to first_k_dense_replace-1)
             if first_k_dense_replace == 1:
                 dense_pattern = r"re:model\.layers\.0\.mlp\..*$"
@@ -195,18 +163,18 @@ def check_dense_layers_and_update_ignore(model_id, ignore_patterns, trust_remote
                 # For multiple layers, use range pattern
                 layer_range = f"[0-{first_k_dense_replace-1}]"
                 dense_pattern = f"re:model\\.layers\\.{layer_range}\\.mlp\\..*$"
-            
+
             # Add the dense layer pattern to ignore list
             updated_ignore_patterns = ignore_patterns + [dense_pattern]
-            
+
             print(f"   Dense layer pattern added: {dense_pattern}")
             print(f"   This will ignore MLP components in layers 0-{first_k_dense_replace-1}")
-            
+
             return updated_ignore_patterns
         else:
             print("ℹ️  No dense layers detected (first_k_dense_replace not found or is 0)")
             return ignore_patterns
-            
+
     except Exception as e:
         print(f"⚠️  Warning: Could not check model config for dense layers: {e}")
         print("   Proceeding with original ignore patterns...")
@@ -246,11 +214,7 @@ def load_and_prepare_dataset(dataset_name, dataset_split, num_samples, max_lengt
     # Tokenize the data
     def tokenize(sample):
         return tokenizer(
-            sample["text"],
-            padding=False,
-            max_length=max_length,
-            truncation=True,
-            add_special_tokens=False
+            sample["text"], padding=False, max_length=max_length, truncation=True, add_special_tokens=False
         )
 
     ds = ds.map(tokenize, remove_columns=ds.column_names)
@@ -291,9 +255,7 @@ def main():
     # 0) Check for dense layers and update ignore patterns
     # Dense layers in the first few layers should not be quantized
     updated_ignore_patterns = check_dense_layers_and_update_ignore(
-        args.model_id,
-        args.ignore_patterns,
-        args.trust_remote_code
+        args.model_id, args.ignore_patterns, args.trust_remote_code
     )
 
     # --------------------------------------------------------------------
@@ -302,13 +264,9 @@ def main():
     print("🔍 Inferring device map...")
     with init_empty_weights():
         dummy = AutoModelForCausalLM.from_pretrained(
-            args.model_id,
-            torch_dtype=torch_dtype,
-            trust_remote_code=args.trust_remote_code
-        )
-        device_map = infer_auto_device_map(
-            dummy, no_split_module_classes=dummy._no_split_modules
+            args.model_id, torch_dtype=torch_dtype, trust_remote_code=args.trust_remote_code
         )
+        device_map = infer_auto_device_map(dummy, no_split_module_classes=dummy._no_split_modules)
         del dummy
 
     # Force all modules to CPU for quantization
@@ -335,7 +293,7 @@ def main():
         args.num_calibration_samples,
         args.max_sequence_length,
         tokenizer,
-        args.random_seed
+        args.random_seed,
     )
 
     # --------------------------------------------------------------------
@@ -373,4 +331,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/kt-kernel/scripts/convert_kimi_k2_fp8_to_bf16_cpu.py b/kt-kernel/scripts/convert_kimi_k2_fp8_to_bf16_cpu.py
index 01a2c87..f91f024 100644
--- a/kt-kernel/scripts/convert_kimi_k2_fp8_to_bf16_cpu.py
+++ b/kt-kernel/scripts/convert_kimi_k2_fp8_to_bf16_cpu.py
@@ -9,6 +9,7 @@ from safetensors.torch import load_file, save_file
 
 import gc
 
+
 def weight_dequant_cpu(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> torch.Tensor:
     assert x.dim() == 2 and s.dim() == 2, "Expect 2D tensors for x and s"
     M, N = x.shape
@@ -27,6 +28,7 @@ def weight_dequant_cpu(x: torch.Tensor, s: torch.Tensor, block_size: int = 128)
             y[m0:m1, n0:n1] = sub.to(torch.bfloat16)
     return y
 
+
 def main(fp8_path, bf16_path):
     torch.set_default_dtype(torch.bfloat16)
     os.makedirs(bf16_path, exist_ok=True)
@@ -34,7 +36,7 @@ def main(fp8_path, bf16_path):
     with open(model_index_file, "r") as f:
         model_index = json.load(f)
     weight_map = model_index["weight_map"]
-    
+
     loaded_files = {}
     fp8_weight_names = []
 
@@ -51,7 +53,7 @@ def main(fp8_path, bf16_path):
         file_name = os.path.basename(safetensor_file)
         current_state_dict = load_file(safetensor_file, device="cpu")
         loaded_files[file_name] = current_state_dict
-        
+
         new_state_dict = {}
         for weight_name, weight in current_state_dict.items():
             if weight_name.endswith("_scale_inv"):
@@ -67,17 +69,17 @@ def main(fp8_path, bf16_path):
                     new_state_dict[weight_name] = weight
             else:
                 new_state_dict[weight_name] = weight
-                
+
         new_safetensor_file = os.path.join(bf16_path, file_name)
         save_file(new_state_dict, new_safetensor_file)
-        
+
         if len(loaded_files) > 2:
             oldest_file = next(iter(loaded_files))
             del loaded_files[oldest_file]
             gc.collect()
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
-    
+
     new_model_index_file = os.path.join(bf16_path, "model.safetensors.index.json")
     for weight_name in fp8_weight_names:
         scale_inv_name = f"{weight_name}_scale_inv"
@@ -87,9 +89,10 @@ def main(fp8_path, bf16_path):
         json.dump({"metadata": {}, "weight_map": weight_map}, f, indent=2)
     print(f"Finish, Result in: {bf16_path}")
 
+
 if __name__ == "__main__":
     parser = ArgumentParser()
     parser.add_argument("--input-fp8-hf-path", type=str, required=True, help="Kimi-K2 FP8 model")
     parser.add_argument("--output-bf16-hf-path", type=str, required=True, help="BF16 model (After convert)")
     args = parser.parse_args()
-    main(args.input_fp8_hf_path, args.output_bf16_hf_path)
\ No newline at end of file
+    main(args.input_fp8_hf_path, args.output_bf16_hf_path)
diff --git a/kt-kernel/scripts/convert_moe_to_bf16.py b/kt-kernel/scripts/convert_moe_to_bf16.py
index d618472..87e3473 100644
--- a/kt-kernel/scripts/convert_moe_to_bf16.py
+++ b/kt-kernel/scripts/convert_moe_to_bf16.py
@@ -48,9 +48,7 @@ def _dequantize_tensor(
         if scales.numel() == weight.numel():
             scales = scales.reshape_as(weight)
         else:
-            raise ValueError(
-                f"Scale shape {scales.shape} incompatible with weight shape {weight.shape}"
-            )
+            raise ValueError(f"Scale shape {scales.shape} incompatible with weight shape {weight.shape}")
     bf16 = (weight.to(torch.float32) * scales).to(torch.bfloat16)
     return bf16.contiguous()
 
@@ -128,9 +126,7 @@ def convert_file(
 
     os.makedirs(os.path.dirname(output_path), exist_ok=True)
     save_file(tensors, output_path)
-    print(
-        f"[done] wrote {output_path} (converted={stats['converted']}, skipped={stats['skipped']})"
-    )
+    print(f"[done] wrote {output_path} (converted={stats['converted']}, skipped={stats['skipped']})")
 
 
 def parse_args() -> argparse.Namespace:
@@ -174,9 +170,7 @@ def main():
         targets = [os.path.join(model_dir, fname) for fname in args.files]
     else:
         targets = [
-            os.path.join(model_dir, name)
-            for name in sorted(os.listdir(model_dir))
-            if name.endswith(".safetensors")
+            os.path.join(model_dir, name) for name in sorted(os.listdir(model_dir)) if name.endswith(".safetensors")
         ]
 
     if not targets:
diff --git a/kt-kernel/scripts/install-git-hooks.sh b/kt-kernel/scripts/install-git-hooks.sh
index be3b03e..b1d0beb 100755
--- a/kt-kernel/scripts/install-git-hooks.sh
+++ b/kt-kernel/scripts/install-git-hooks.sh
@@ -1,24 +1,29 @@
 #!/usr/bin/env sh
-# Install git hooks from .githooks into .git/hooks by creating symlinks (or copying if symlink fails).
+# Install git hooks from kt-kernel/.githooks into the monorepo's .git/hooks by
+# creating symlinks (or copying if symlink fails).
 
 set -eu
 
+# This script lives in kt-kernel/scripts/, so REPO_ROOT = kt-kernel
 REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
-GIT_DIR="$REPO_ROOT/.git"
 HOOKS_SRC="$REPO_ROOT/.githooks"
+
+# Detect the top-level Git worktree (the monorepo root: ktransformers)
+GIT_TOP="$(git rev-parse --show-toplevel 2>/dev/null || true)"
+if [ -z "$GIT_TOP" ] || [ ! -d "$GIT_TOP/.git" ]; then
+  echo "[install-git-hooks] Not inside a git worktree; skipping hooks installation." >&2
+  exit 0
+fi
+
+GIT_DIR="$GIT_TOP/.git"
 HOOKS_DEST="$GIT_DIR/hooks"
 
-if [ ! -d "$GIT_DIR" ]; then
-  echo "Not a git repository (no .git directory) at $REPO_ROOT" >&2
-  exit 1
-fi
-
 if [ ! -d "$HOOKS_SRC" ]; then
-  echo "No .githooks directory found at $HOOKS_SRC" >&2
+  echo "[install-git-hooks] No .githooks directory found at $HOOKS_SRC" >&2
   exit 1
 fi
 
-echo "Installing git hooks from $HOOKS_SRC to $HOOKS_DEST"
+echo "[install-git-hooks] Installing git hooks from $HOOKS_SRC to $HOOKS_DEST (repo: $GIT_TOP)"
 
 # Ensure all source hook files are executable so that even if copied (not symlinked) they run.
 for src_hook in "$HOOKS_SRC"/*; do
@@ -49,4 +54,4 @@ for hook in "$HOOKS_SRC"/*; do
   fi
 done
 
-echo "Done. Hooks installed."
+echo "[install-git-hooks] Done. Hooks installed."