mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-04-20 06:18:59 +00:00
fix kt-kernel installation issue (#1603)
* update README for kt-kernel for installation issues * update install.sh * update install.sh * update install.sh * update install.sh * update install.sh * update README * fix sudo issue * fix install issue * fix import issue * fix import issue * update install.sh * fix import issue * fix import issue * fix import issue * update README * update install * update install
This commit is contained in:
@@ -31,16 +31,38 @@ git submodule update --init --recursive
|
||||
|
||||
### Quick Installation (Recommended)
|
||||
|
||||
The installation script automatically detects your CPU and configures optimal build settings:
|
||||
Step 0: Create and activate a conda environment (recommended):
|
||||
|
||||
```bash
|
||||
# Simple one-command installation (auto-detects CPU)
|
||||
./install.sh
|
||||
conda create -n kt-kernel python=3.11 -y
|
||||
conda activate kt-kernel
|
||||
```
|
||||
|
||||
The installation script will:
|
||||
You can now install in two clear steps using the same script.
|
||||
|
||||
Option A: Two-step (explicit)
|
||||
|
||||
```bash
|
||||
# 1) Install system prerequisites (cmake, hwloc, pkg-config)
|
||||
./install.sh deps
|
||||
|
||||
# 2) Build and install kt-kernel (auto-detects CPU)
|
||||
# By default, the script cleans the local ./build directory before compiling.
|
||||
./install.sh build
|
||||
```
|
||||
|
||||
Option B: One-step (deps + build)
|
||||
|
||||
```bash
|
||||
# Simple one-command installation
|
||||
./install.sh # same as: ./install.sh all
|
||||
# Skip deps step if you already installed them
|
||||
./install.sh all --skip-deps
|
||||
```
|
||||
|
||||
The install script will:
|
||||
- Auto-detect CPU capabilities (AMX support)
|
||||
- Install `cmake` via conda (for the latest version)
|
||||
- Install `cmake` via conda (if available)
|
||||
- Install system dependencies (`libhwloc-dev`, `pkg-config`) based on your OS
|
||||
|
||||
**What gets configured automatically:**
|
||||
@@ -58,8 +80,8 @@ If you need specific build options (e.g., for LLAMAFILE backend, compatibility,
|
||||
export CPUINFER_CPU_INSTRUCT=AVX512 # Options: NATIVE, AVX512, AVX2
|
||||
export CPUINFER_ENABLE_AMX=OFF # Options: ON, OFF
|
||||
|
||||
# Run with manual mode
|
||||
./install.sh --manual
|
||||
# Run with manual mode (build only)
|
||||
./install.sh build --manual
|
||||
```
|
||||
|
||||
For advanced build options and binary distribution, see the [Build Configuration](#build-configuration) section. If you encounter issues, refer to [Error Troubleshooting](#error-troubleshooting).
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include <numa.h>
|
||||
|
||||
#include <cstdio>
|
||||
#include <errno.h>
|
||||
|
||||
size_t MemoryRequest::total_size() {
|
||||
size_t total = 0;
|
||||
@@ -53,12 +54,15 @@ void SharedMemBuffer::alloc(void* object, MemoryRequest requests) {
|
||||
if (buffer) {
|
||||
free(buffer);
|
||||
}
|
||||
buffer = std::aligned_alloc(64, total_size);
|
||||
if (!buffer) {
|
||||
printf("cannot aligned alloc %ld bytes\n", total_size);
|
||||
perror("aligned_alloc"); // errno == ENOMEM/EINVAL
|
||||
void* newbuf = nullptr;
|
||||
int rc = posix_memalign(&newbuf, 64, total_size);
|
||||
if (rc != 0 || !newbuf) {
|
||||
errno = rc; // posix_memalign returns error code instead of setting errno
|
||||
printf("cannot aligned alloc %zu bytes (align=%d)\n", (size_t)total_size, 64);
|
||||
perror("posix_memalign"); // ENOMEM/EINVAL
|
||||
exit(1);
|
||||
}
|
||||
buffer = newbuf;
|
||||
size = total_size;
|
||||
for (auto& req : object_requests) {
|
||||
req.update_base_ptr(buffer);
|
||||
|
||||
168
kt-kernel/examples/repro_llamafile_re.py
Normal file
168
kt-kernel/examples/repro_llamafile_re.py
Normal file
@@ -0,0 +1,168 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Minimal LLAMAFILE repro harness to catch intermittent RuntimeError/RE.
|
||||
|
||||
Requirements:
|
||||
- kt_kernel_ext built with LLAMAFILE (and CUDA stream integration)
|
||||
- Valid GGUF weights directory (WEIGHT_PATH)
|
||||
|
||||
Usage:
|
||||
WEIGHT_PATH=/path/to/gguf python examples/repro_llamafile_re.py
|
||||
|
||||
Optional env:
|
||||
DEVICE=cuda|cpu # default: auto (cuda if available)
|
||||
N_ITERS=1000 # iterations
|
||||
BATCH=4 # batch size
|
||||
H=2048 # hidden size
|
||||
EXPERTS=128 # total experts
|
||||
TOPK=8 # experts per token
|
||||
INTER=768 # intermediate size (must be divisible by 256)
|
||||
GPU_EXPERTS=100 # num experts on GPU side
|
||||
TP=2 # threadpool_count
|
||||
CPU_THREADS=32 # cpuinfer_threads
|
||||
MAX_DEFER=2 # max_deferred_experts_per_token
|
||||
MODE=split|forward # split=submit+sync, forward=wrapper.forward
|
||||
SEED=1 # random seed
|
||||
|
||||
Debug tips:
|
||||
- Set CUDA_LAUNCH_BLOCKING=1 to catch async errors deterministically.
|
||||
- Try varying N_ITERS, BATCH, TOPK, MAX_DEFER.
|
||||
- Capture stdout/stderr for failure iteration index.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
import faulthandler
|
||||
import torch
|
||||
|
||||
from kt_kernel import KTMoEWrapper
|
||||
|
||||
|
||||
def getenv_int(name: str, default: int) -> int:
    """Read environment variable *name* as an int, falling back to *default*.

    Falls back when the variable is unset or holds a non-integer value.
    Only ValueError is swallowed — the original broad `except Exception`
    could hide unrelated bugs (e.g. a bad `name` type).
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        # Malformed value (e.g. "abc" or "1.5") — use the default instead.
        return default
|
||||
|
||||
|
||||
def get_stream_for(device: torch.device | str):
|
||||
device = torch.device(device)
|
||||
if device.type == "cuda" and torch.cuda.is_available():
|
||||
return torch.cuda.current_stream(device).cuda_stream
|
||||
return 0
|
||||
|
||||
|
||||
def main() -> int:
    """Drive the LLAMAFILE repro loop.

    Returns a process exit code: 0 on clean completion, 1 when an iteration
    raises, 2 on bad configuration (missing weights or invalid INTER).
    """
    faulthandler.enable()

    # Weights are mandatory — bail out early with a usage-style error.
    weight_path = (os.environ.get("WEIGHT_PATH") or "").strip()
    if not weight_path:
        print("ERROR: WEIGHT_PATH env is required.")
        return 2
    if not os.path.exists(weight_path):
        print(f"ERROR: WEIGHT_PATH does not exist: {weight_path}")
        return 2

    # Explicit DEVICE env wins; otherwise prefer CUDA when present.
    device = torch.device(os.environ.get("DEVICE") or ("cuda" if torch.cuda.is_available() else "cpu"))

    # Tunables, all overridable through the environment (see module docstring).
    n_iters = getenv_int("N_ITERS", 1000)
    batch = getenv_int("BATCH", 4)
    hidden = getenv_int("H", 2048)
    experts = getenv_int("EXPERTS", 128)
    topk = getenv_int("TOPK", 8)
    inter = getenv_int("INTER", 768)
    gpu_experts = getenv_int("GPU_EXPERTS", 100)
    tp = getenv_int("TP", 2)
    cpu_threads = getenv_int("CPU_THREADS", 32)
    max_defer = getenv_int("MAX_DEFER", 2)
    seed = getenv_int("SEED", 1)
    mode = (os.environ.get("MODE") or "split").lower()

    # LLAMAFILE kernels require the intermediate size to be a multiple of 256.
    if inter % 256 != 0:
        print(f"ERROR: INTER must be divisible by 256 for LLAMAFILE (got {inter}).")
        return 2

    print(
        f"LLAMAFILE Repro: device={device}, iters={n_iters}, batch={batch}, H={hidden}, topk={topk}, E={experts}, inter={inter}, TP={tp}, CPU_THREADS={cpu_threads}, mode={mode}"
    )
    print(f"Weights: {weight_path}")

    torch.manual_seed(seed)

    # Build the wrapper once and load weights up front, outside the loop.
    wrapper = KTMoEWrapper(
        layer_idx=0,
        num_experts=experts,
        num_experts_per_tok=topk,
        hidden_size=hidden,
        moe_intermediate_size=inter,
        num_gpu_experts=gpu_experts,
        cpuinfer_threads=cpu_threads,
        threadpool_count=tp,
        weight_path=weight_path,
        chunked_prefill_size=512,
        method="LLAMAFILE",
        max_deferred_experts_per_token=max_defer,
    )
    wrapper.load_weights()

    # Optional capture of small batch sizes
    KTMoEWrapper.set_capture_batch_sizes([1, 2, 4, 8, 16])

    stream = get_stream_for(device)

    # Preallocate I/O tensors and reuse them so allocator churn doesn't mask the bug.
    hidden_states = torch.empty(batch, hidden, dtype=torch.bfloat16, device=device)
    topk_ids = torch.empty(batch, topk, dtype=torch.long, device=device)
    topk_weights = torch.empty(batch, topk, dtype=torch.float32, device=device)

    def randomize_inputs():
        # Fresh activations, expert ids, and (roughly) normalized routing weights.
        hidden_states.normal_(mean=0.0, std=1.0)
        topk_ids.random_(0, experts)
        topk_weights.uniform_()
        topk_weights.div_(topk_weights.sum(dim=-1, keepdim=True) + 1e-6)

    # Single warmup pass before the stress iterations.
    randomize_inputs()
    wrapper.forward(hidden_states, topk_ids, topk_weights, stream)
    if device.type == "cuda":
        torch.cuda.synchronize(device)

    on_cuda = device.type == "cuda"
    for it in range(n_iters):
        try:
            randomize_inputs()
            if mode == "forward":
                wrapper.forward(hidden_states, topk_ids, topk_weights, stream)
            else:
                # split mode: submit, enqueue a trivial GPU op on the same
                # stream, then sync — mirrors the real serving call pattern.
                wrapper.submit_forward(hidden_states, topk_ids, topk_weights, stream)
                if on_cuda:
                    hidden_states.add_(0)  # no-op but enqueued on current stream
                wrapper.sync_forward(hidden_states, stream)

            if (it + 1) % 50 == 0:
                print(f"ok: iter {it + 1}/{n_iters}")
                if on_cuda:
                    torch.cuda.synchronize(device)

        except Exception as exc:
            print(f"FAIL at iter {it}: {repr(exc)}")
            # Flush outstanding GPU work so the failure report is complete.
            if on_cuda:
                try:
                    torch.cuda.synchronize(device)
                except Exception:
                    pass
            return 1

    print("All iterations completed without error.")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status
    # (raising SystemExit is exactly what sys.exit() does internally).
    raise SystemExit(main())
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env bash
|
||||
set -e
|
||||
set -euo pipefail
|
||||
|
||||
install_dependencies() {
|
||||
echo "Checking and installing system dependencies..."
|
||||
@@ -65,14 +65,21 @@ install_dependencies
|
||||
|
||||
usage() {
|
||||
cat <<EOF
|
||||
Usage: $0 [OPTIONS]
|
||||
Usage: $0 [SUBCOMMAND] [BUILD_OPTIONS]
|
||||
|
||||
This script builds kt-kernel with optimal settings for your CPU.
|
||||
Two-step installation in one file. Choose a subcommand:
|
||||
|
||||
OPTIONS:
|
||||
(none) Auto-detect CPU and configure automatically (recommended)
|
||||
SUBCOMMANDS:
|
||||
deps Install system prerequisites only
|
||||
build Build and install kt-kernel (no dependency install)
|
||||
all Run deps then build (default when no subcommand)
|
||||
-h, --help Show this help message
|
||||
|
||||
BUILD_OPTIONS (for "build" or "all"):
|
||||
(none) Auto-detect CPU and configure automatically (recommended)
|
||||
--manual Skip auto-detection, use manual configuration (see below)
|
||||
--skip-deps Skip deps step even with subcommand "all"
|
||||
--no-clean Do not delete local build/ before building (default cleans)
|
||||
|
||||
AUTO-DETECTION (Default):
|
||||
The script will automatically detect your CPU capabilities and configure:
|
||||
@@ -115,6 +122,66 @@ EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Install build prerequisites: cmake (via conda when available) plus
# hwloc and pkg-config through the native package manager.
install_dependencies() {
    echo "Checking and installing system dependencies..."

    # Determine if we need to use sudo (root runs package managers directly).
    SUDO=""
    if [ "${EUID:-0}" -ne 0 ]; then
        if command -v sudo &> /dev/null; then
            SUDO="sudo"
        else
            echo "Warning: Not running as root and sudo not found. Package installation may fail."
            echo "Please run as root or install sudo."
        fi
    fi

    # Prefer conda's cmake for an up-to-date version, but never let a transient
    # conda/network failure abort the whole install: this script runs under
    # `set -euo pipefail`, so an unguarded failing command would kill it.
    if command -v conda &> /dev/null; then
        echo "Installing cmake via conda..."
        if ! conda install -y cmake; then
            echo "Warning: 'conda install cmake' failed. Continuing with any existing cmake."
        fi
    else
        echo "Warning: conda not found. Skipping cmake installation via conda."
        echo "Please install conda or manually install cmake."
    fi

    # Detect OS type via os-release (ID), falling back to distro marker files.
    if [ -f /etc/os-release ]; then
        . /etc/os-release
        OS=$ID
    elif [ -f /etc/debian_version ]; then
        OS="debian"
    elif [ -f /etc/redhat-release ]; then
        OS="rhel"
    else
        echo "Warning: Unable to detect OS type. Skipping dependency installation."
        return 0
    fi

    # Install dependencies based on OS
    case "$OS" in
        debian|ubuntu|linuxmint|pop)
            echo "Detected Debian-based system. Installing libhwloc-dev and pkg-config..."
            $SUDO apt update
            $SUDO apt install -y libhwloc-dev pkg-config
            ;;
        fedora|rhel|centos|rocky|almalinux)
            echo "Detected Red Hat-based system. Installing hwloc-devel and pkgconfig..."
            # dnf on modern Red Hat family distros, yum as the legacy fallback.
            $SUDO dnf install -y hwloc-devel pkgconfig || $SUDO yum install -y hwloc-devel pkgconfig
            ;;
        arch|manjaro)
            echo "Detected Arch-based system. Installing hwloc and pkgconf..."
            $SUDO pacman -S --noconfirm hwloc pkgconf
            ;;
        opensuse*|sles)
            echo "Detected openSUSE-based system. Installing hwloc-devel and pkg-config..."
            $SUDO zypper install -y hwloc-devel pkg-config
            ;;
        *)
            echo "Warning: Unsupported OS '$OS'. Please manually install libhwloc-dev and pkg-config."
            ;;
    esac
}
|
||||
|
||||
# Function to detect CPU features
|
||||
detect_cpu_features() {
|
||||
local has_amx=0
|
||||
@@ -132,18 +199,33 @@ detect_cpu_features() {
|
||||
echo "$has_amx"
|
||||
}
|
||||
|
||||
# Check if user requested help
|
||||
if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
|
||||
usage
|
||||
fi
|
||||
build_step() {
|
||||
# Parse build-only flags from arguments to this function
|
||||
local MANUAL_MODE=0
|
||||
local CLEAN_BUILD=1
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--manual) MANUAL_MODE=1; shift ;;
|
||||
--skip-deps) shift ;; # ignore here
|
||||
--no-clean) CLEAN_BUILD=0; shift ;;
|
||||
-h|--help) usage ;;
|
||||
*) break ;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Check if manual mode
|
||||
MANUAL_MODE=0
|
||||
if [ "$1" = "--manual" ]; then
|
||||
MANUAL_MODE=1
|
||||
fi
|
||||
# Clean local build directory to ensure a fresh CMake/configure
|
||||
local REPO_ROOT
|
||||
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
if [[ "$CLEAN_BUILD" -eq 1 ]]; then
|
||||
if [[ -d "$REPO_ROOT/build" ]]; then
|
||||
echo "Cleaning previous build directory: $REPO_ROOT/build"
|
||||
rm -rf "$REPO_ROOT/build"
|
||||
fi
|
||||
else
|
||||
echo "Skipping clean of $REPO_ROOT/build (requested by --no-clean)"
|
||||
fi
|
||||
|
||||
if [ "$MANUAL_MODE" = "0" ]; then
|
||||
if [ "$MANUAL_MODE" = "0" ]; then
|
||||
# Auto-detection mode
|
||||
echo "=========================================="
|
||||
echo "Auto-detecting CPU capabilities..."
|
||||
@@ -172,7 +254,7 @@ if [ "$MANUAL_MODE" = "0" ]; then
|
||||
echo ""
|
||||
echo "To use manual configuration instead, run: $0 --manual"
|
||||
echo ""
|
||||
else
|
||||
else
|
||||
# Manual mode - validate user configuration (no exports)
|
||||
if [ -z "$CPUINFER_CPU_INSTRUCT" ] || [ -z "$CPUINFER_ENABLE_AMX" ]; then
|
||||
echo "Error: Manual mode requires CPUINFER_CPU_INSTRUCT and CPUINFER_ENABLE_AMX to be set."
|
||||
@@ -216,7 +298,9 @@ else
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
# Close MANUAL_MODE conditional
|
||||
fi
|
||||
|
||||
# Set defaults for optional variables
|
||||
export CPUINFER_BUILD_TYPE=${CPUINFER_BUILD_TYPE:-Release}
|
||||
@@ -232,9 +316,31 @@ echo " CPUINFER_VERBOSE=$CPUINFER_VERBOSE"
|
||||
echo ""
|
||||
|
||||
pip install . -v
|
||||
}
|
||||
|
||||
# Subcommand dispatcher: default to "all"
|
||||
SUBCMD="all"
|
||||
if [[ $# -gt 0 ]]; then
|
||||
case "$1" in
|
||||
deps|build|all) SUBCMD="$1"; shift ;;
|
||||
-h|--help) usage ;;
|
||||
*) SUBCMD="build" ;; # backward compatibility: flags-only => build
|
||||
esac
|
||||
fi
|
||||
|
||||
echo "Successfully built and installed kt-kernel! with configuration:"
|
||||
echo " CPUINFER_CPU_INSTRUCT=$CPUINFER_CPU_INSTRUCT"
|
||||
echo " CPUINFER_ENABLE_AMX=$CPUINFER_ENABLE_AMX"
|
||||
echo " CPUINFER_BUILD_TYPE=$CPUINFER_BUILD_TYPE"
|
||||
case "$SUBCMD" in
|
||||
deps)
|
||||
install_dependencies
|
||||
;;
|
||||
build)
|
||||
build_step "$@"
|
||||
;;
|
||||
all)
|
||||
if [[ " ${*:-} " == *" --skip-deps "* ]]; then
|
||||
build_step "$@"
|
||||
else
|
||||
install_dependencies
|
||||
build_step "$@"
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
|
||||
Reference in New Issue
Block a user