Kt minimax (#1742)

[feat]: fp8 kernel and kt-cli support
This commit is contained in:
ErvinXie
2025-12-24 15:39:44 +08:00
committed by GitHub
parent e7d277d163
commit d8046e1bb4
65 changed files with 12111 additions and 2502 deletions

View File

@@ -37,11 +37,13 @@ from __future__ import annotations
# Detect CPU and load optimal extension variant
from ._cpu_detect import initialize as _initialize_cpu
_kt_kernel_ext, __cpu_variant__ = _initialize_cpu()
# Make the extension module available to other modules in this package
import sys
sys.modules['kt_kernel_ext'] = _kt_kernel_ext
sys.modules["kt_kernel_ext"] = _kt_kernel_ext
# Also expose kt_kernel_ext as an attribute for backward compatibility
kt_kernel_ext = _kt_kernel_ext
@@ -53,25 +55,28 @@ from .experts import KTMoEWrapper
try:
# Try to get version from installed package metadata (works in installed environment)
from importlib.metadata import version, PackageNotFoundError
try:
__version__ = version('kt-kernel')
__version__ = version("kt-kernel")
except PackageNotFoundError:
# Package not installed, try to read from source tree version.py
import os
_root_version_file = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'version.py')
_root_version_file = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "version.py")
if os.path.exists(_root_version_file):
_version_ns = {}
with open(_root_version_file, 'r', encoding='utf-8') as f:
with open(_root_version_file, "r", encoding="utf-8") as f:
exec(f.read(), _version_ns)
__version__ = _version_ns.get('__version__', '0.4.3')
__version__ = _version_ns.get("__version__", "0.4.3")
else:
__version__ = "0.4.3"
except ImportError:
# Python < 3.8, fallback to pkg_resources or hardcoded version
try:
from pkg_resources import get_distribution, DistributionNotFound
try:
__version__ = get_distribution('kt-kernel').version
__version__ = get_distribution("kt-kernel").version
except DistributionNotFound:
__version__ = "0.4.3"
except ImportError:

View File

@@ -17,6 +17,7 @@ Example:
>>> os.environ['KT_KERNEL_CPU_VARIANT'] = 'avx2'
>>> import kt_kernel # Will use AVX2 variant
"""
import os
import sys
from pathlib import Path
@@ -35,82 +36,82 @@ def detect_cpu_features():
str: 'amx', 'avx512', or 'avx2'
"""
# Check environment override
variant = os.environ.get('KT_KERNEL_CPU_VARIANT', '').lower()
if variant in ['amx', 'avx512', 'avx2']:
if os.environ.get('KT_KERNEL_DEBUG') == '1':
variant = os.environ.get("KT_KERNEL_CPU_VARIANT", "").lower()
if variant in ["amx", "avx512", "avx2"]:
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print(f"[kt-kernel] Using environment override: {variant}")
return variant
# Try to read /proc/cpuinfo on Linux
try:
with open('/proc/cpuinfo', 'r') as f:
with open("/proc/cpuinfo", "r") as f:
cpuinfo = f.read().lower()
# Check for AMX support (Intel Sapphire Rapids+)
# AMX requires amx_tile, amx_int8, and amx_bf16
amx_flags = ['amx_tile', 'amx_int8', 'amx_bf16']
amx_flags = ["amx_tile", "amx_int8", "amx_bf16"]
has_amx = all(flag in cpuinfo for flag in amx_flags)
if has_amx:
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print("[kt-kernel] Detected AMX support via /proc/cpuinfo")
return 'amx'
return "amx"
# Check for AVX512 support
# AVX512F is the foundation for all AVX512 variants
if 'avx512f' in cpuinfo:
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if "avx512f" in cpuinfo:
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print("[kt-kernel] Detected AVX512 support via /proc/cpuinfo")
return 'avx512'
return "avx512"
# Check for AVX2 support
if 'avx2' in cpuinfo:
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if "avx2" in cpuinfo:
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print("[kt-kernel] Detected AVX2 support via /proc/cpuinfo")
return 'avx2'
return "avx2"
# Fallback to AVX2 (should be rare on modern CPUs)
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print("[kt-kernel] No AVX2/AVX512/AMX detected, using AVX2 fallback")
return 'avx2'
return "avx2"
except FileNotFoundError:
# /proc/cpuinfo doesn't exist (not Linux or in container)
# Try cpufeature package as fallback
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print("[kt-kernel] /proc/cpuinfo not found, trying cpufeature package")
try:
import cpufeature
# Check for AMX
if cpufeature.CPUFeature.get('AMX_TILE', False):
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if cpufeature.CPUFeature.get("AMX_TILE", False):
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print("[kt-kernel] Detected AMX support via cpufeature")
return 'amx'
return "amx"
# Check for AVX512
if cpufeature.CPUFeature.get('AVX512F', False):
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if cpufeature.CPUFeature.get("AVX512F", False):
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print("[kt-kernel] Detected AVX512 support via cpufeature")
return 'avx512'
return "avx512"
# Fallback to AVX2
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print("[kt-kernel] Using AVX2 fallback via cpufeature")
return 'avx2'
return "avx2"
except ImportError:
# cpufeature not available - ultimate fallback
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print("[kt-kernel] cpufeature not available, using AVX2 fallback")
return 'avx2'
return "avx2"
except Exception as e:
# Any other error - safe fallback
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print(f"[kt-kernel] Error during CPU detection: {e}, using AVX2 fallback")
return 'avx2'
return "avx2"
def load_extension(variant):
@@ -148,51 +149,53 @@ def load_extension(variant):
kt_kernel_dir = os.path.dirname(os.path.abspath(__file__))
# Try multi-variant naming first
pattern = os.path.join(kt_kernel_dir, f'_kt_kernel_ext_{variant}.*.so')
pattern = os.path.join(kt_kernel_dir, f"_kt_kernel_ext_{variant}.*.so")
so_files = glob.glob(pattern)
if not so_files:
# Try single-variant naming (fallback for builds without CPUINFER_BUILD_ALL_VARIANTS)
pattern = os.path.join(kt_kernel_dir, 'kt_kernel_ext.*.so')
pattern = os.path.join(kt_kernel_dir, "kt_kernel_ext.*.so")
so_files = glob.glob(pattern)
if so_files:
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print(f"[kt-kernel] Multi-variant {variant} not found, using single-variant build")
else:
raise ImportError(f"No .so file found for variant {variant} (tried patterns: {kt_kernel_dir}/_kt_kernel_ext_{variant}.*.so and {kt_kernel_dir}/kt_kernel_ext.*.so)")
raise ImportError(
f"No .so file found for variant {variant} (tried patterns: {kt_kernel_dir}/_kt_kernel_ext_{variant}.*.so and {kt_kernel_dir}/kt_kernel_ext.*.so)"
)
so_file = so_files[0]
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print(f"[kt-kernel] Loading {variant} from: {so_file}")
# Load the module manually
# The module exports PyInit_kt_kernel_ext, so we use that as the module name
spec = importlib.util.spec_from_file_location('kt_kernel_ext', so_file)
spec = importlib.util.spec_from_file_location("kt_kernel_ext", so_file)
if spec is None or spec.loader is None:
raise ImportError(f"Failed to create spec for {so_file}")
ext = importlib.util.module_from_spec(spec)
spec.loader.exec_module(ext)
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print(f"[kt-kernel] Successfully loaded {variant.upper()} variant")
return ext
except (ImportError, ModuleNotFoundError, FileNotFoundError) as e:
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print(f"[kt-kernel] Failed to load {variant} variant: {e}")
# Automatic fallback to next best variant
if variant == 'amx':
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if variant == "amx":
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print("[kt-kernel] Falling back from AMX to AVX512")
return load_extension('avx512')
elif variant == 'avx512':
if os.environ.get('KT_KERNEL_DEBUG') == '1':
return load_extension("avx512")
elif variant == "avx512":
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print("[kt-kernel] Falling back from AVX512 to AVX2")
return load_extension('avx2')
return load_extension("avx2")
else:
# AVX2 is the last fallback - if this fails, we can't continue
raise ImportError(
@@ -221,13 +224,13 @@ def initialize():
# Detect CPU features
variant = detect_cpu_features()
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print(f"[kt-kernel] Selected CPU variant: {variant}")
# Load the appropriate extension
ext = load_extension(variant)
if os.environ.get('KT_KERNEL_DEBUG') == '1':
if os.environ.get("KT_KERNEL_DEBUG") == "1":
print(f"[kt-kernel] Extension module loaded: {ext.__name__}")
return ext, variant

View File

@@ -0,0 +1,8 @@
"""
KTransformers CLI - A unified command-line interface for KTransformers.
This CLI provides a user-friendly interface to all KTransformers functionality,
including model inference, fine-tuning, benchmarking, and more.
"""
__version__ = "0.1.0"

View File

@@ -0,0 +1,3 @@
"""
Command modules for kt-cli.
"""

View File

@@ -0,0 +1,274 @@
"""
Bench commands for kt-cli.
Runs benchmarks for performance testing.
"""
import subprocess
import sys
from enum import Enum
from pathlib import Path
from typing import Optional
import typer
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import (
console,
print_error,
print_info,
print_step,
print_success,
)
class BenchType(str, Enum):
    """Benchmark type selectable via the `--type/-t` CLI option."""
    # The `str` mixin makes members compare and serialize as plain strings,
    # so `type.value` can be used directly as a component/script key.
    INFERENCE = "inference"  # end-to-end inference benchmark (requires --model)
    MLA = "mla"  # component benchmark — presumably multi-head latent attention
    MOE = "moe"  # component benchmark — mixture-of-experts kernels
    LINEAR = "linear"  # component benchmark — linear/GEMM layers
    ATTENTION = "attention"  # component benchmark — attention kernels
    ALL = "all"  # run every component benchmark in sequence
def bench(
    type: BenchType = typer.Option(
        BenchType.ALL,
        "--type",
        "-t",
        help="Benchmark type",
    ),
    model: Optional[str] = typer.Option(
        None,
        "--model",
        "-m",
        help="Model to benchmark",
    ),
    output: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Output file for results (JSON)",
    ),
    iterations: int = typer.Option(
        10,
        "--iterations",
        "-n",
        help="Number of iterations",
    ),
) -> None:
    """Run full benchmark suite.

    Dispatches to the requested runner: `all` fans out over every component,
    `inference` runs the end-to-end benchmark, and the remaining types share
    the script-based component runner (their enum value is the script key).
    """
    console.print()
    print_step(t("bench_starting"))
    print_info(t("bench_type", type=type.value))
    console.print()
    if type is BenchType.ALL:
        _run_all_benchmarks(model, output, iterations)
    elif type is BenchType.INFERENCE:
        _run_inference_benchmark(model, output, iterations)
    else:
        # MLA / MOE / LINEAR / ATTENTION: enum value doubles as component name.
        _run_component_benchmark(type.value, output, iterations)
    console.print()
    print_success(t("bench_complete"))
    if output:
        console.print(f" Results saved to: {output}")
    console.print()
def microbench(
    component: str = typer.Argument(
        "moe",
        help="Component to benchmark (moe, mla, linear, attention)",
    ),
    batch_size: int = typer.Option(
        1,
        "--batch-size",
        "-b",
        help="Batch size",
    ),
    seq_len: int = typer.Option(
        1,
        "--seq-len",
        "-s",
        help="Sequence length",
    ),
    iterations: int = typer.Option(
        100,
        "--iterations",
        "-n",
        help="Number of iterations",
    ),
    warmup: int = typer.Option(
        10,
        "--warmup",
        "-w",
        help="Warmup iterations",
    ),
    output: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Output file for results (JSON)",
    ),
) -> None:
    """Run micro-benchmark for specific components.

    Currently gated behind a "coming soon" notice: the command prints the
    notice and exits with status 0. The implementation below the early exit
    is staged for when the gate is removed.
    """
    console.print()
    console.print(f"[yellow]{t('feature_coming_soon')}[/yellow]")
    console.print()
    raise typer.Exit(0)
    # --- Staged implementation: unreachable until the Exit(0) gate above
    # --- is removed. Kept intentionally so the feature can be enabled by
    # --- deleting the three lines above.
    # Try to find the benchmark script
    kt_kernel_path = _find_kt_kernel_path()
    if kt_kernel_path is None:
        print_error("kt-kernel not found. Install with: kt install inference")
        raise typer.Exit(1)
    bench_dir = kt_kernel_path / "bench"
    # Map component to script
    component_scripts = {
        "moe": "bench_moe.py",
        "mla": "bench_mla.py",
        "linear": "bench_linear.py",
        "attention": "bench_attention.py",
        "mlp": "bench_mlp.py",
    }
    script_name = component_scripts.get(component.lower())
    if script_name is None:
        print_error(f"Unknown component: {component}")
        console.print(f"Available: {', '.join(component_scripts.keys())}")
        raise typer.Exit(1)
    script_path = bench_dir / script_name
    if not script_path.exists():
        print_error(f"Benchmark script not found: {script_path}")
        raise typer.Exit(1)
    # Run benchmark
    cmd = [
        sys.executable,
        str(script_path),
        "--batch-size",
        str(batch_size),
        "--seq-len",
        str(seq_len),
        "--iterations",
        str(iterations),
        "--warmup",
        str(warmup),
    ]
    if output:
        cmd.extend(["--output", str(output)])
    console.print(f"[dim]$ {' '.join(cmd)}[/dim]")
    console.print()
    try:
        process = subprocess.run(cmd)
        if process.returncode == 0:
            console.print()
            print_success(t("bench_complete"))
            if output:
                console.print(f" Results saved to: {output}")
        else:
            print_error(f"Benchmark failed with exit code {process.returncode}")
            raise typer.Exit(process.returncode)
    except FileNotFoundError as e:
        print_error(f"Failed to run benchmark: {e}")
        # Chain the cause so the missing-interpreter/script error is kept
        # in the traceback (the original discarded it).
        raise typer.Exit(1) from e
def _find_kt_kernel_path() -> Optional[Path]:
"""Find the kt-kernel installation path."""
try:
import kt_kernel
return Path(kt_kernel.__file__).parent.parent
except ImportError:
pass
# Check common locations
possible_paths = [
Path.home() / "Projects" / "ktransformers" / "kt-kernel",
Path("/opt/ktransformers/kt-kernel"),
Path.cwd() / "kt-kernel",
]
for path in possible_paths:
if path.exists() and (path / "bench").exists():
return path
return None
def _run_all_benchmarks(model: Optional[str], output: Optional[Path], iterations: int) -> None:
    """Run every component benchmark in sequence.

    NOTE(review): `model` and `output` are currently ignored — each component
    run is invoked with output=None. Confirm whether aggregated result saving
    is intended here.
    """
    for name in ("moe", "mla", "linear", "attention"):
        console.print(f"\n[bold]Running {name} benchmark...[/bold]")
        _run_component_benchmark(name, None, iterations)
def _run_inference_benchmark(model: Optional[str], output: Optional[Path], iterations: int) -> None:
    """Run inference benchmark.

    Placeholder implementation: validates that a model was supplied, prints
    what the benchmark would do, then reports not-implemented. `output` and
    `iterations` are currently unused here.
    """
    if model is None:
        print_error("Model required for inference benchmark. Use --model flag.")
        raise typer.Exit(1)
    print_info(f"Running inference benchmark on {model}...")
    console.print()
    console.print("[dim]This will start the server and run test requests.[/dim]")
    console.print()
    # TODO: Implement actual inference benchmarking
    print_error("Inference benchmarking not yet implemented.")
def _run_component_benchmark(component: str, output: Optional[Path], iterations: int) -> None:
    """Run a single component benchmark script via a subprocess.

    Args:
        component: One of "moe", "mla", "linear", "attention".
        output: Optional JSON results path, forwarded to the script.
        iterations: Number of benchmark iterations.
    """
    kt_kernel_path = _find_kt_kernel_path()
    if kt_kernel_path is None:
        print_error("kt-kernel not found.")
        return
    bench_dir = kt_kernel_path / "bench"
    script_map = {
        "moe": "bench_moe.py",
        "mla": "bench_mla.py",
        "linear": "bench_linear.py",
        "attention": "bench_attention.py",
    }
    script_name = script_map.get(component)
    if script_name is None:
        print_error(f"Unknown component: {component}")
        return
    script_path = bench_dir / script_name
    if not script_path.exists():
        print_error(f"Script not found: {script_path}")
        return
    cmd = [sys.executable, str(script_path), "--iterations", str(iterations)]
    # Fix: forward --output to the script. Previously this parameter was
    # accepted (and passed in by bench()) but silently dropped, so
    # `kt bench -t moe -o results.json` reported "Results saved to"
    # without anything ever being written.
    if output:
        cmd.extend(["--output", str(output)])
    try:
        subprocess.run(cmd)
    except Exception as e:
        print_error(f"Benchmark failed: {e}")

View File

@@ -0,0 +1,437 @@
"""
Chat command for kt-cli.
Provides interactive chat interface with running model server.
"""
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
import typer
from rich.console import Console
from rich.markdown import Markdown
from rich.panel import Panel
from rich.prompt import Prompt, Confirm
from kt_kernel.cli.config.settings import get_settings
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import (
console,
print_error,
print_info,
print_success,
print_warning,
)
# Try to import OpenAI SDK.
# The client is an optional dependency: chat() checks HAS_OPENAI and exits
# with install instructions instead of failing at import time.
try:
    from openai import OpenAI
    HAS_OPENAI = True
except ImportError:
    HAS_OPENAI = False
def chat(
    host: Optional[str] = typer.Option(
        None,
        "--host",
        "-H",
        help="Server host address",
    ),
    port: Optional[int] = typer.Option(
        None,
        "--port",
        "-p",
        help="Server port",
    ),
    model: Optional[str] = typer.Option(
        None,
        "--model",
        "-m",
        help="Model name (if server hosts multiple models)",
    ),
    temperature: float = typer.Option(
        0.7,
        "--temperature",
        "-t",
        help="Sampling temperature (0.0 to 2.0)",
    ),
    max_tokens: int = typer.Option(
        2048,
        "--max-tokens",
        help="Maximum tokens to generate",
    ),
    system_prompt: Optional[str] = typer.Option(
        None,
        "--system",
        "-s",
        help="System prompt",
    ),
    save_history: bool = typer.Option(
        True,
        "--save-history/--no-save-history",
        help="Save conversation history",
    ),
    history_file: Optional[Path] = typer.Option(
        None,
        "--history-file",
        help="Path to save conversation history",
    ),
    stream: bool = typer.Option(
        True,
        "--stream/--no-stream",
        help="Enable streaming output",
    ),
) -> None:
    """Start interactive chat with a running model server.

    Connects to an OpenAI-compatible endpoint at http://<host>:<port>/v1,
    selects a model, then loops: read input, dispatch slash-commands,
    generate the assistant reply (streaming or not), and optionally
    persist the conversation to a JSON history file after every turn.

    Examples:
        kt chat                           # Connect to default server
        kt chat --host 127.0.0.1 -p 8080  # Connect to specific server
        kt chat -t 0.9 --max-tokens 4096  # Adjust generation parameters
    """
    if not HAS_OPENAI:
        print_error("OpenAI Python SDK is required for chat functionality.")
        console.print()
        console.print("Install it with:")
        console.print(" pip install openai")
        raise typer.Exit(1)
    settings = get_settings()
    # Resolve server connection: CLI flags win over persisted settings.
    final_host = host or settings.get("server.host", "127.0.0.1")
    final_port = port or settings.get("server.port", 30000)
    # Construct base URL for OpenAI-compatible API
    base_url = f"http://{final_host}:{final_port}/v1"
    console.print()
    console.print(
        Panel.fit(
            f"[bold cyan]KTransformers Chat[/bold cyan]\n\n"
            f"Server: [yellow]{final_host}:{final_port}[/yellow]\n"
            f"Temperature: [cyan]{temperature}[/cyan] | Max tokens: [cyan]{max_tokens}[/cyan]\n\n"
            f"[dim]Type '/help' for commands, '/quit' to exit[/dim]",
            border_style="cyan",
        )
    )
    console.print()
    # Check for proxy environment variables — a proxy would intercept the
    # localhost API connection and break it.
    proxy_vars = ["HTTP_PROXY", "HTTPS_PROXY", "http_proxy", "https_proxy", "ALL_PROXY", "all_proxy"]
    detected_proxies = {var: os.environ.get(var) for var in proxy_vars if os.environ.get(var)}
    if detected_proxies:
        proxy_info = ", ".join(f"{k}={v}" for k, v in detected_proxies.items())
        console.print()
        print_warning(t("chat_proxy_detected"))
        console.print(f" [dim]{proxy_info}[/dim]")
        console.print()
        use_proxy = Confirm.ask(t("chat_proxy_confirm"), default=False)
        if not use_proxy:
            # NOTE(review): this mutates os.environ for the whole process,
            # not just this connection — the variables are not restored.
            for var in proxy_vars:
                if var in os.environ:
                    del os.environ[var]
            print_info(t("chat_proxy_disabled"))
        console.print()
    # Initialize OpenAI client
    try:
        client = OpenAI(
            base_url=base_url,
            api_key="EMPTY",  # SGLang doesn't require API key
        )
        # Test connection by listing the models the server exposes.
        print_info("Connecting to server...")
        models = client.models.list()
        available_models = [m.id for m in models.data]
        if not available_models:
            print_error("No models available on server")
            # NOTE(review): typer.Exit is an Exception subclass, so the
            # broad except below also reports this as a connection failure
            # before re-exiting — confirm intended.
            raise typer.Exit(1)
        # Select model: honor --model when present, else take the first one.
        if model:
            if model not in available_models:
                print_warning(f"Model '{model}' not found. Available models: {', '.join(available_models)}")
                selected_model = available_models[0]
            else:
                selected_model = model
        else:
            selected_model = available_models[0]
        print_success(f"Connected to model: {selected_model}")
        console.print()
    except Exception as e:
        print_error(f"Failed to connect to server: {e}")
        console.print()
        console.print("Make sure the model server is running:")
        console.print(" kt run <model>")
        raise typer.Exit(1)
    # Initialize conversation history (OpenAI chat-message dicts).
    messages = []
    # Add system prompt if provided
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    # Setup history file: default is a timestamped file under the config dir.
    if save_history:
        if history_file is None:
            history_dir = settings.config_dir / "chat_history"
            history_dir.mkdir(parents=True, exist_ok=True)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            history_file = history_dir / f"chat_{timestamp}.json"
        else:
            history_file = Path(history_file)
            history_file.parent.mkdir(parents=True, exist_ok=True)
    # Main chat loop
    try:
        while True:
            # Get user input; Ctrl-D / Ctrl-C at the prompt exits cleanly.
            try:
                user_input = Prompt.ask("[bold green]You[/bold green]")
            except (EOFError, KeyboardInterrupt):
                console.print()
                print_info("Goodbye!")
                break
            if not user_input.strip():
                continue
            # Handle special commands (/help, /quit, ...) before generation.
            if user_input.startswith("/"):
                if _handle_command(user_input, messages, temperature, max_tokens):
                    continue
                else:
                    break  # Exit command
            # Add user message to history
            messages.append({"role": "user", "content": user_input})
            # Generate response
            console.print()
            console.print("[bold cyan]Assistant[/bold cyan]")
            try:
                if stream:
                    # Streaming response
                    response_content = _stream_response(client, selected_model, messages, temperature, max_tokens)
                else:
                    # Non-streaming response
                    response_content = _generate_response(client, selected_model, messages, temperature, max_tokens)
                # Add assistant response to history
                messages.append({"role": "assistant", "content": response_content})
                console.print()
            except Exception as e:
                print_error(f"Error generating response: {e}")
                # Remove the user message that caused the error so a retry
                # doesn't duplicate it.
                messages.pop()
                continue
            # Save history if enabled (after every completed turn).
            if save_history:
                _save_history(history_file, messages, selected_model)
    except KeyboardInterrupt:
        console.print()
        console.print()
        print_info("Chat interrupted. Goodbye!")
    # Final history save
    if save_history and messages:
        _save_history(history_file, messages, selected_model)
        console.print(f"[dim]History saved to: {history_file}[/dim]")
    console.print()
def _stream_response(
    client: "OpenAI",
    model: str,
    messages: list,
    temperature: float,
    max_tokens: int,
) -> str:
    """Stream a chat completion, echoing tokens to the console as they arrive.

    Args:
        client: OpenAI-compatible API client.
        model: Model identifier to query.
        messages: Conversation so far, in OpenAI chat-message format.
        temperature: Sampling temperature.
        max_tokens: Generation limit.

    Returns:
        The full concatenated response text.

    Raises:
        RuntimeError: If the streaming request fails; chained to the
            original error so the underlying traceback is preserved.
    """
    response_content = ""
    try:
        stream = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=True,
        )
        for chunk in stream:
            # Guard empty `choices` too: some servers emit usage-only /
            # finish chunks with no choice entries, which would raise
            # IndexError here.
            if chunk.choices and chunk.choices[0].delta.content:
                content = chunk.choices[0].delta.content
                response_content += content
                console.print(content, end="")
        console.print()  # Newline after streaming
    except Exception as e:
        # Chain the cause instead of discarding it (previously raised a
        # bare Exception with no __cause__). RuntimeError is still caught
        # by the caller's `except Exception`.
        raise RuntimeError(f"Streaming error: {e}") from e
    return response_content
def _generate_response(
    client: "OpenAI",
    model: str,
    messages: list,
    temperature: float,
    max_tokens: int,
) -> str:
    """Generate a non-streaming response and render it as markdown.

    Args:
        client: OpenAI-compatible API client.
        model: Model identifier to query.
        messages: Conversation so far, in OpenAI chat-message format.
        temperature: Sampling temperature.
        max_tokens: Generation limit.

    Returns:
        The response text ("" when the server returned no content).

    Raises:
        RuntimeError: If the request fails; chained to the original error
            so the underlying traceback is preserved.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=False,
        )
        # message.content can be None (e.g. tool-call style responses);
        # normalize to "" so Markdown() and history storage never see None.
        content = response.choices[0].message.content or ""
        # Display as markdown
        md = Markdown(content)
        console.print(md)
        return content
    except Exception as e:
        # Chain the cause instead of discarding it (previously raised a
        # bare Exception). RuntimeError is still caught by the caller's
        # `except Exception`.
        raise RuntimeError(f"Generation error: {e}") from e
def _handle_command(command: str, messages: list, temperature: float, max_tokens: int) -> bool:
    """Dispatch a chat slash-command.

    Args:
        command: Raw user input beginning with '/'.
        messages: Conversation history (mutated by /clear and /retry).
        temperature: Current sampling temperature (displayed by /info).
        max_tokens: Current generation limit (displayed by /info).

    Returns:
        True to continue the chat loop, False to exit.
    """
    cmd = command.lower().strip()
    # Exit commands are the only path that returns False.
    if cmd in ("/quit", "/exit", "/q"):
        console.print()
        print_info("Goodbye!")
        return False
    if cmd in ("/help", "/h"):
        console.print()
        console.print(
            Panel(
                "[bold]Available Commands:[/bold]\n\n"
                "/help, /h - Show this help message\n"
                "/quit, /exit, /q - Exit chat\n"
                "/clear, /c - Clear conversation history\n"
                "/history, /hist - Show conversation history\n"
                "/info, /i - Show current settings\n"
                "/retry, /r - Regenerate last response",
                title="Help",
                border_style="cyan",
            )
        )
        console.print()
    elif cmd in ("/clear", "/c"):
        messages.clear()
        console.print()
        print_success("Conversation history cleared")
        console.print()
    elif cmd in ("/history", "/hist"):
        console.print()
        if messages:
            console.print(
                Panel(
                    _format_history(messages),
                    title=f"History ({len(messages)} messages)",
                    border_style="cyan",
                )
            )
        else:
            print_info("No conversation history")
        console.print()
    elif cmd in ("/info", "/i"):
        console.print()
        console.print(
            Panel(
                f"[bold]Current Settings:[/bold]\n\n"
                f"Temperature: [cyan]{temperature}[/cyan]\n"
                f"Max tokens: [cyan]{max_tokens}[/cyan]\n"
                f"Messages: [cyan]{len(messages)}[/cyan]",
                title="Info",
                border_style="cyan",
            )
        )
        console.print()
    elif cmd in ("/retry", "/r"):
        if len(messages) >= 2 and messages[-1]["role"] == "assistant":
            # Drop the last assistant turn so the caller regenerates it.
            messages.pop()
            print_info("Retrying last response...")
            console.print()
        else:
            print_warning("No previous response to retry")
            console.print()
    else:
        print_warning(f"Unknown command: {command}")
        console.print("[dim]Type /help for available commands[/dim]")
        console.print()
    return True
def _format_history(messages: list) -> str:
"""Format conversation history for display."""
lines = []
for i, msg in enumerate(messages, 1):
role = msg["role"].capitalize()
content = msg["content"]
# Truncate long messages
if len(content) > 200:
content = content[:200] + "..."
lines.append(f"[bold]{i}. {role}:[/bold] {content}")
return "\n\n".join(lines)
def _save_history(file_path: Path, messages: list, model: str) -> None:
"""Save conversation history to file."""
try:
history_data = {
"model": model,
"timestamp": datetime.now().isoformat(),
"messages": messages,
}
with open(file_path, "w", encoding="utf-8") as f:
json.dump(history_data, f, indent=2, ensure_ascii=False)
except Exception as e:
print_warning(f"Failed to save history: {e}")

View File

@@ -0,0 +1,167 @@
"""
Config command for kt-cli.
Manages kt-cli configuration.
"""
from typing import Optional
import typer
import yaml
from rich.syntax import Syntax
from kt_kernel.cli.config.settings import get_settings
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import confirm, console, print_error, print_success
app = typer.Typer(help="Manage kt-cli configuration")
@app.command(name="init")
def init() -> None:
"""Initialize or re-run the first-time setup wizard."""
from kt_kernel.cli.main import _show_first_run_setup
from kt_kernel.cli.config.settings import get_settings
settings = get_settings()
_show_first_run_setup(settings)
@app.command(name="show")
def show(
key: Optional[str] = typer.Argument(None, help="Configuration key to show (e.g., server.port)"),
) -> None:
"""Show current configuration."""
settings = get_settings()
if key:
value = settings.get(key)
if value is not None:
if isinstance(value, (dict, list)):
console.print(yaml.dump({key: value}, default_flow_style=False, allow_unicode=True))
else:
console.print(t("config_get_value", key=key, value=value))
else:
print_error(t("config_get_not_found", key=key))
raise typer.Exit(1)
else:
console.print(f"\n[bold]{t('config_show_title')}[/bold]\n")
console.print(f"[dim]{t('config_file_location', path=str(settings.config_path))}[/dim]\n")
config_yaml = yaml.dump(settings.get_all(), default_flow_style=False, allow_unicode=True)
syntax = Syntax(config_yaml, "yaml", theme="monokai", line_numbers=False)
console.print(syntax)
@app.command(name="set")
def set_config(
key: str = typer.Argument(..., help="Configuration key (e.g., server.port)"),
value: str = typer.Argument(..., help="Value to set"),
) -> None:
"""Set a configuration value."""
settings = get_settings()
# Try to parse value as JSON/YAML for complex types
parsed_value = _parse_value(value)
settings.set(key, parsed_value)
print_success(t("config_set_success", key=key, value=parsed_value))
@app.command(name="get")
def get_config(
key: str = typer.Argument(..., help="Configuration key (e.g., server.port)"),
) -> None:
"""Get a configuration value."""
settings = get_settings()
value = settings.get(key)
if value is not None:
if isinstance(value, (dict, list)):
console.print(yaml.dump(value, default_flow_style=False, allow_unicode=True))
else:
console.print(str(value))
else:
print_error(t("config_get_not_found", key=key))
raise typer.Exit(1)
@app.command(name="reset")
def reset(
yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation"),
) -> None:
"""Reset configuration to defaults."""
if not yes:
if not confirm(t("config_reset_confirm"), default=False):
raise typer.Abort()
settings = get_settings()
settings.reset()
print_success(t("config_reset_success"))
@app.command(name="path")
def path() -> None:
"""Show configuration file path."""
settings = get_settings()
console.print(str(settings.config_path))
@app.command(name="model-path-list", deprecated=True, hidden=True)
def model_path_list() -> None:
"""[Deprecated] Use 'kt model path-list' instead."""
console.print("[yellow]⚠ This command is deprecated. Use 'kt model path-list' instead.[/yellow]\n")
import subprocess
subprocess.run(["kt", "model", "path-list"])
@app.command(name="model-path-add", deprecated=True, hidden=True)
def model_path_add(
path: str = typer.Argument(..., help="Path to add"),
) -> None:
"""[Deprecated] Use 'kt model path-add' instead."""
console.print("[yellow]⚠ This command is deprecated. Use 'kt model path-add' instead.[/yellow]\n")
import subprocess
subprocess.run(["kt", "model", "path-add", path])
@app.command(name="model-path-remove", deprecated=True, hidden=True)
def model_path_remove(
path: str = typer.Argument(..., help="Path to remove"),
) -> None:
"""[Deprecated] Use 'kt model path-remove' instead."""
console.print("[yellow]⚠ This command is deprecated. Use 'kt model path-remove' instead.[/yellow]\n")
import subprocess
subprocess.run(["kt", "model", "path-remove", path])
def _parse_value(value: str):
"""Parse a string value into appropriate Python type."""
# Try boolean
if value.lower() in ("true", "yes", "on", "1"):
return True
if value.lower() in ("false", "no", "off", "0"):
return False
# Try integer
try:
return int(value)
except ValueError:
pass
# Try float
try:
return float(value)
except ValueError:
pass
# Try YAML/JSON parsing for lists/dicts
try:
parsed = yaml.safe_load(value)
if isinstance(parsed, (dict, list)):
return parsed
except yaml.YAMLError:
pass
# Return as string
return value

View File

@@ -0,0 +1,394 @@
"""
Doctor command for kt-cli.
Diagnoses environment issues and provides recommendations.
"""
import platform
import shutil
from pathlib import Path
from typing import Optional
import typer
from rich.table import Table
from kt_kernel.cli.config.settings import get_settings
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import console, print_error, print_info, print_success, print_warning
from kt_kernel.cli.utils.environment import (
check_docker,
detect_available_ram_gb,
detect_cpu_info,
detect_cuda_version,
detect_disk_space_gb,
detect_env_managers,
detect_gpus,
detect_memory_info,
detect_ram_gb,
get_installed_package_version,
)
def doctor(
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed diagnostics"),
) -> None:
    """Diagnose environment issues.

    Runs a fixed series of checks (Python version, CUDA, GPU, CPU, CPU
    instruction sets, NUMA topology, memory, disk space per model path,
    required packages, SGLang install source and kt-kernel support,
    environment managers) and renders them via ``_display_results``.
    Problems are reported in the table/summary, never raised.

    Args:
        verbose: Also show per-package rows, NUMA CPU ranges, and hint
            notes in the output table.
    """
    console.print(f"\n[bold]{t('doctor_title')}[/bold]\n")
    issues_found = False
    checks = []
    # 1. Python version (3.10+ required)
    python_version = platform.python_version()
    python_ok = _check_python_version(python_version)
    checks.append(
        {
            "name": t("doctor_check_python"),
            "status": "ok" if python_ok else "error",
            "value": python_version,
            "hint": "Python 3.10+ required" if not python_ok else None,
        }
    )
    if not python_ok:
        issues_found = True
    # 2. CUDA availability (optional; absence is only a warning)
    cuda_version = detect_cuda_version()
    checks.append(
        {
            "name": t("doctor_check_cuda"),
            "status": "ok" if cuda_version else "warning",
            "value": cuda_version or t("version_cuda_not_found"),
            "hint": "CUDA is optional but recommended for GPU acceleration" if not cuda_version else None,
        }
    )
    # 3. GPU detection
    gpus = detect_gpus()
    if gpus:
        gpu_names = ", ".join(g.name for g in gpus)
        total_vram = sum(g.vram_gb for g in gpus)
        checks.append(
            {
                "name": t("doctor_check_gpu"),
                "status": "ok",
                "value": t("doctor_gpu_found", count=len(gpus), names=gpu_names),
                "hint": f"Total VRAM: {total_vram}GB",
            }
        )
    else:
        checks.append(
            {
                "name": t("doctor_check_gpu"),
                "status": "warning",
                "value": t("doctor_gpu_not_found"),
                "hint": "GPU recommended for best performance",
            }
        )
    # 4. CPU information
    cpu_info = detect_cpu_info()
    checks.append(
        {
            "name": t("doctor_check_cpu"),
            "status": "ok",
            "value": t("doctor_cpu_info", name=cpu_info.name, cores=cpu_info.cores, threads=cpu_info.threads),
            "hint": None,
        }
    )
    # 5. CPU instruction sets (critical for kt-kernel).
    # Severity ladder: AMX (best) > AVX512 (good) > AVX2 (warning) > none (error).
    isa_list = cpu_info.instruction_sets
    has_avx2 = "AVX2" in isa_list
    has_avx512 = any(isa.startswith("AVX512") for isa in isa_list)
    has_amx = any(isa.startswith("AMX") for isa in isa_list)
    if has_amx:
        isa_status = "ok"
        isa_hint = "AMX available - best performance for INT4/INT8"
    elif has_avx512:
        isa_status = "ok"
        isa_hint = "AVX512 available - good performance"
    elif has_avx2:
        isa_status = "warning"
        isa_hint = "AVX2 only - consider upgrading CPU for better performance"
    else:
        isa_status = "error"
        isa_hint = "AVX2 required for kt-kernel"
    if isa_status == "error":
        # A hard requirement is missing; reflect it in the final summary
        # (previously only the table row was marked).
        issues_found = True
    # Show at most 8 instruction sets; summarize the rest as a count.
    display_isa = isa_list[:8] if len(isa_list) > 8 else isa_list
    isa_display = ", ".join(display_isa)
    if len(isa_list) > 8:
        isa_display += f" (+{len(isa_list) - 8} more)"
    checks.append(
        {
            "name": t("doctor_check_cpu_isa"),
            "status": isa_status,
            "value": isa_display if isa_display else "None detected",
            "hint": isa_hint,
        }
    )
    # 6. NUMA topology
    numa_detail = []
    for node, cpus in sorted(cpu_info.numa_info.items()):
        # Collapse long CPU lists to a "first-last" range for readability.
        if len(cpus) > 6:
            cpu_str = f"{cpus[0]}-{cpus[-1]}"
        else:
            cpu_str = ",".join(str(c) for c in cpus)
        numa_detail.append(f"{node}: {cpu_str}")
    numa_value = t("doctor_numa_info", nodes=cpu_info.numa_nodes)
    if verbose and numa_detail:
        numa_value += " (" + "; ".join(numa_detail) + ")"
    checks.append(
        {
            "name": t("doctor_check_numa"),
            "status": "ok",
            "value": numa_value,
            "hint": f"{cpu_info.threads // cpu_info.numa_nodes} threads per node" if cpu_info.numa_nodes > 1 else None,
        }
    )
    # 7. System memory (with frequency/type when the platform exposes them)
    mem_info = detect_memory_info()
    if mem_info.frequency_mhz and mem_info.type:
        mem_value = t(
            "doctor_memory_freq",
            available=f"{mem_info.available_gb}GB",
            total=f"{mem_info.total_gb}GB",
            freq=mem_info.frequency_mhz,
            type=mem_info.type,
        )
    else:
        mem_value = t("doctor_memory_info", available=f"{mem_info.available_gb}GB", total=f"{mem_info.total_gb}GB")
    ram_ok = mem_info.total_gb >= 32
    checks.append(
        {
            "name": t("doctor_check_memory"),
            "status": "ok" if ram_ok else "warning",
            "value": mem_value,
            "hint": "32GB+ RAM recommended for large models" if not ram_ok else None,
        }
    )
    # 8. Disk space - one row per configured model storage path
    settings = get_settings()
    model_paths = settings.get_model_paths()
    for i, disk_path in enumerate(model_paths):
        available_disk, _total_disk = detect_disk_space_gb(str(disk_path))
        disk_ok = available_disk >= 100
        # With multiple paths, label each row with its 1-based index.
        path_label = f"Model Path {i+1}" if len(model_paths) > 1 else t("doctor_check_disk")
        checks.append(
            {
                "name": path_label,
                "status": "ok" if disk_ok else "warning",
                "value": t("doctor_disk_info", available=f"{available_disk}GB", path=str(disk_path)),
                "hint": "100GB+ free space recommended for model storage" if not disk_ok else None,
            }
        )
    # 9. Required packages
    packages = [
        ("kt-kernel", ">=0.4.0", False),  # name, version_req, required
        ("ktransformers", ">=0.4.0", False),
        ("sglang", ">=0.4.0", False),
        ("torch", ">=2.4.0", True),
        ("transformers", ">=4.45.0", True),
    ]
    package_issues = []
    for pkg_name, version_req, required in packages:
        version = get_installed_package_version(pkg_name)
        if version:
            package_issues.append((pkg_name, version, "ok"))
        elif required:
            package_issues.append((pkg_name, t("version_not_installed"), "error"))
            issues_found = True
        else:
            package_issues.append((pkg_name, t("version_not_installed"), "warning"))
    if verbose:
        # Per-package rows are only rendered in verbose mode; missing
        # required packages still flip issues_found either way.
        checks.append(
            {
                "name": t("doctor_check_packages"),
                "status": "ok" if not any(p[2] == "error" for p in package_issues) else "error",
                "value": f"{sum(1 for p in package_issues if p[2] == 'ok')}/{len(package_issues)} installed",
                "packages": package_issues,
            }
        )
    # 10. SGLang installation source check
    from kt_kernel.cli.utils.sglang_checker import check_sglang_installation, check_sglang_kt_kernel_support

    sglang_info = check_sglang_installation()
    if sglang_info["installed"]:
        if sglang_info["from_source"]:
            if sglang_info["git_info"]:
                git_remote = sglang_info["git_info"].get("remote", "unknown")
                git_branch = sglang_info["git_info"].get("branch", "unknown")
                sglang_source_value = f"Source (GitHub: {git_remote}, branch: {git_branch})"
                sglang_source_status = "ok"
                sglang_source_hint = None
            else:
                sglang_source_value = "Source (editable)"
                sglang_source_status = "ok"
                sglang_source_hint = None
        else:
            sglang_source_value = "PyPI (not recommended)"
            sglang_source_status = "warning"
            sglang_source_hint = t("sglang_pypi_hint")
    else:
        sglang_source_value = "Not installed"
        sglang_source_status = "warning"
        sglang_source_hint = t("sglang_install_hint")
    checks.append(
        {
            "name": "SGLang Source",
            "status": sglang_source_status,
            "value": sglang_source_value,
            "hint": sglang_source_hint,
        }
    )
    # 11. SGLang kt-kernel support check (only if SGLang is installed)
    kt_kernel_support = {"supported": True}  # Default to True if not checked
    if sglang_info["installed"]:
        # use_cache=False forces a fresh probe in doctor; silent=True because
        # the result is shown in the table rather than printed inline.
        kt_kernel_support = check_sglang_kt_kernel_support(use_cache=False, silent=True)
        if kt_kernel_support["supported"]:
            kt_kernel_value = t("sglang_kt_kernel_supported")
            kt_kernel_status = "ok"
            kt_kernel_hint = None
        else:
            kt_kernel_value = t("sglang_kt_kernel_not_supported")
            kt_kernel_status = "error"
            kt_kernel_hint = 'Reinstall SGLang from: git clone https://github.com/kvcache-ai/sglang && cd sglang && pip install -e "python[all]"'
            issues_found = True
        checks.append(
            {
                "name": "SGLang kt-kernel",
                "status": kt_kernel_status,
                "value": kt_kernel_value,
                "hint": kt_kernel_hint,
            }
        )
    # 12. Environment managers (conda etc., plus docker)
    env_managers = detect_env_managers()
    docker = check_docker()
    env_list = [f"{m.name} {m.version}" for m in env_managers]
    if docker:
        env_list.append(f"docker {docker.version}")
    checks.append(
        {
            "name": "Environment Managers",
            "status": "ok" if env_list else "warning",
            "value": ", ".join(env_list) if env_list else "None found",
            "hint": "conda or docker recommended for installation" if not env_list else None,
        }
    )
    # Render the table of results
    _display_results(checks, verbose)
    # Follow-up instructions when SGLang is missing or lacks kt-kernel support
    if not sglang_info["installed"]:
        from kt_kernel.cli.utils.sglang_checker import print_sglang_install_instructions

        console.print()
        print_sglang_install_instructions()
    elif sglang_info["installed"] and not kt_kernel_support.get("supported", True):
        from kt_kernel.cli.utils.sglang_checker import print_sglang_kt_kernel_instructions

        console.print()
        print_sglang_kt_kernel_instructions()
    # Final one-line summary
    console.print()
    if issues_found:
        print_warning(t("doctor_has_issues"))
    else:
        print_success(t("doctor_all_ok"))
    console.print()
def _check_python_version(version: str) -> bool:
"""Check if Python version meets requirements."""
parts = version.split(".")
try:
major, minor = int(parts[0]), int(parts[1])
return major >= 3 and minor >= 10
except (IndexError, ValueError):
return False
def _display_results(checks: list[dict], verbose: bool) -> None:
    """Render the collected diagnostic checks as a rich table.

    Each check dict carries "name", "status" ("ok"/"warning"/other=error),
    "value", an optional "hint", and optionally "packages" (a list of
    (name, version, status) tuples shown as sub-rows in verbose mode).
    """
    table = Table(show_header=True, header_style="bold")
    table.add_column("Check", style="bold")
    table.add_column("Status", width=8)
    table.add_column("Value")
    if verbose:
        table.add_column("Notes", style="dim")
    for check in checks:
        status = check["status"]
        # Map the status onto a colored localized label.
        if status == "ok":
            colour, label = "green", t("doctor_status_ok")
        elif status == "warning":
            colour, label = "yellow", t("doctor_status_warning")
        else:
            colour, label = "red", t("doctor_status_error")
        status_str = f"[{colour}]{label}[/{colour}]"
        row = [check["name"], status_str, check["value"]]
        if verbose:
            # Hint column only exists in verbose mode.
            row.append(check.get("hint", ""))
        table.add_row(*row)
        # Package details appear as indented sub-rows in verbose mode.
        if verbose and "packages" in check:
            markers = {"ok": "[green]✓[/green]", "warning": "[yellow]○[/yellow]"}
            for pkg_name, pkg_version, pkg_status in check["packages"]:
                pkg_status_str = markers.get(pkg_status, "[red]✗[/red]")
                table.add_row(
                    f"  └─ {pkg_name}",
                    pkg_status_str,
                    pkg_version,
                    "",
                )
    console.print(table)

View File

@@ -0,0 +1,409 @@
"""
Model command for kt-cli.
Manages models: download, list, and storage paths.
"""
import os
from pathlib import Path
from typing import Optional
import typer
from kt_kernel.cli.config.settings import get_settings
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import (
confirm,
console,
print_error,
print_info,
print_success,
print_warning,
prompt_choice,
)
# Typer sub-application mounted as `kt model ...`.
# invoke_without_command=True + no_args_is_help=False let a bare `kt model`
# reach the callback, which then shows the model listing instead of help.
app = typer.Typer(
    help="Manage models and storage paths",
    invoke_without_command=True,
    no_args_is_help=False,
)
@app.callback()
def callback(ctx: typer.Context) -> None:
    """
    Model management commands.
    Run without arguments to see available models.
    """
    # A subcommand was given: let Typer dispatch to it.
    if ctx.invoked_subcommand is not None:
        return
    # Bare `kt model` falls through to the listing view.
    show_model_list()
def show_model_list() -> None:
    """Display available models with their status and paths.

    Prints three sections: a table of registered models with a "local"
    marker for those already on disk, usage hints for the model
    subcommands, and the configured storage paths (each with an
    existence marker).
    """
    from rich.table import Table

    from kt_kernel.cli.utils.model_registry import get_registry

    # (removed an unused in-function import of get_lang)
    registry = get_registry()
    settings = get_settings()
    console.print()
    console.print(f"[bold cyan]{t('model_supported_title')}[/bold cyan]\n")
    # Map model name -> local path for every model found on disk.
    local_models = {m.name: p for m, p in registry.find_local_models()}
    # Create table
    table = Table(show_header=True, header_style="bold")
    table.add_column(t("model_column_model"), style="cyan", no_wrap=True)
    table.add_column(t("model_column_status"), justify="center")
    all_models = registry.list_all()
    for model in all_models:
        if model.name in local_models:
            status = f"[green]✓ {t('model_status_local')}[/green]"
        else:
            status = "[dim]-[/dim]"
        table.add_row(model.name, status)
    console.print(table)
    console.print()
    # Usage instructions
    console.print(f"[bold]{t('model_usage_title')}:[/bold]")
    console.print(f"{t('model_usage_download')} [cyan]kt model download <model-name>[/cyan]")
    console.print(f"{t('model_usage_list_local')} [cyan]kt model list --local[/cyan]")
    console.print(f"{t('model_usage_search')} [cyan]kt model search <query>[/cyan]")
    console.print()
    # Show model storage paths with an existence marker each.
    model_paths = settings.get_model_paths()
    console.print(f"[bold]{t('model_storage_paths_title')}:[/bold]")
    for path in model_paths:
        marker = "[green]✓[/green]" if path.exists() else "[dim]✗[/dim]"
        console.print(f"  {marker} {path}")
    console.print()
@app.command(name="download")
def download(
model: Optional[str] = typer.Argument(
None,
help="Model name or HuggingFace repo (e.g., deepseek-v3, Qwen/Qwen3-30B)",
),
path: Optional[Path] = typer.Option(
None,
"--path",
"-p",
help="Custom download path",
),
list_models: bool = typer.Option(
False,
"--list",
"-l",
help="List available models",
),
resume: bool = typer.Option(
True,
"--resume/--no-resume",
help="Resume incomplete downloads",
),
yes: bool = typer.Option(
False,
"--yes",
"-y",
help="Skip confirmation prompts",
),
) -> None:
"""Download model weights from HuggingFace."""
import subprocess
from kt_kernel.cli.i18n import get_lang
from kt_kernel.cli.utils.console import print_model_table, print_step
from kt_kernel.cli.utils.model_registry import get_registry
settings = get_settings()
registry = get_registry()
console.print()
# List mode
if list_models or model is None:
print_step(t("download_list_title"))
console.print()
models = registry.list_all()
model_dicts = []
for m in models:
lang = get_lang()
desc = m.description_zh if lang == "zh" and m.description_zh else m.description
model_dicts.append(
{
"name": m.name,
"hf_repo": m.hf_repo,
"type": m.type,
"gpu_vram_gb": m.gpu_vram_gb,
"cpu_ram_gb": m.cpu_ram_gb,
}
)
print_model_table(model_dicts)
console.print()
if model is None:
console.print(f"[dim]{t('model_download_usage_hint')}[/dim]")
console.print()
return
# Search for model
print_step(t("download_searching", name=model))
# Check if it's a direct HuggingFace repo path
if "/" in model:
hf_repo = model
model_info = None
model_name = model.split("/")[-1]
else:
matches = registry.search(model)
if not matches:
print_error(t("run_model_not_found", name=model))
console.print()
console.print(t("model_download_list_hint"))
console.print(t("model_download_hf_hint"))
raise typer.Exit(1)
if len(matches) == 1:
model_info = matches[0]
else:
console.print()
print_info(t("download_multiple_found"))
choices = [f"{m.name} ({m.hf_repo})" for m in matches]
selected = prompt_choice(t("download_select"), choices)
idx = choices.index(selected)
model_info = matches[idx]
hf_repo = model_info.hf_repo
model_name = model_info.name
print_success(t("download_found", name=hf_repo))
# Determine download path
if path is None:
download_path = settings.models_dir / model_name.replace(" ", "-")
else:
download_path = path
console.print()
print_info(t("download_destination", path=str(download_path)))
# Check if already exists
if download_path.exists() and (download_path / "config.json").exists():
print_warning(t("download_already_exists", path=str(download_path)))
if not yes:
if not confirm(t("download_overwrite_prompt"), default=False):
raise typer.Abort()
# Confirm download
if not yes:
console.print()
if not confirm(t("prompt_continue")):
raise typer.Abort()
# Download using huggingface-cli
console.print()
print_step(t("download_starting"))
cmd = [
"huggingface-cli",
"download",
hf_repo,
"--local-dir",
str(download_path),
]
if resume:
cmd.append("--resume-download")
# Add mirror if configured
mirror = settings.get("download.mirror", "")
if mirror:
cmd.extend(["--endpoint", mirror])
try:
process = subprocess.run(cmd, check=True)
console.print()
print_success(t("download_complete"))
console.print()
console.print(f" {t('model_saved_to', path=download_path)}")
console.print()
console.print(f" {t('model_start_with', name=model_name)}")
console.print()
except subprocess.CalledProcessError as e:
print_error(t("model_download_failed", error=str(e)))
raise typer.Exit(1)
except FileNotFoundError:
print_error(t("model_hf_cli_not_found"))
raise typer.Exit(1)
@app.command(name="list")
def list_models(
local_only: bool = typer.Option(False, "--local", help="Show only locally downloaded models"),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed info including paths"),
) -> None:
"""List available models."""
from rich.table import Table
from kt_kernel.cli.utils.model_registry import get_registry
registry = get_registry()
console.print()
if local_only:
# Show only local models
local_models = registry.find_local_models()
if not local_models:
print_warning(t("model_no_local_models"))
console.print()
console.print(f" {t('model_download_hint')} [cyan]kt model download <model-name>[/cyan]")
console.print()
return
table = Table(title=t("model_local_models_title"), show_header=True, header_style="bold")
table.add_column(t("model_column_model"), style="cyan", no_wrap=True)
if verbose:
table.add_column(t("model_column_local_path"), style="dim")
for model_info, model_path in local_models:
if verbose:
table.add_row(model_info.name, str(model_path))
else:
table.add_row(model_info.name)
console.print(table)
else:
# Show all registered models
all_models = registry.list_all()
local_models_dict = {m.name: p for m, p in registry.find_local_models()}
table = Table(title=t("model_available_models_title"), show_header=True, header_style="bold")
table.add_column(t("model_column_model"), style="cyan", no_wrap=True)
table.add_column(t("model_column_status"), justify="center")
if verbose:
table.add_column(t("model_column_local_path"), style="dim")
for model in all_models:
if model.name in local_models_dict:
status = f"[green]✓ {t('model_status_local')}[/green]"
local_path = str(local_models_dict[model.name])
else:
status = "[dim]-[/dim]"
local_path = f"[dim]{t('model_status_not_downloaded')}[/dim]"
if verbose:
table.add_row(model.name, status, local_path)
else:
table.add_row(model.name, status)
console.print(table)
console.print()
@app.command(name="path-list")
def path_list() -> None:
"""List all configured model storage paths."""
settings = get_settings()
model_paths = settings.get_model_paths()
console.print()
console.print(f"[bold]{t('model_storage_paths_title')}:[/bold]\n")
for i, path in enumerate(model_paths, 1):
marker = "[green]✓[/green]" if path.exists() else "[red]✗[/red]"
console.print(f" {marker} [{i}] {path}")
console.print()
@app.command(name="path-add")
def path_add(
path: str = typer.Argument(..., help="Path to add"),
) -> None:
"""Add a new model storage path."""
# Expand user home directory
path = os.path.expanduser(path)
# Check if path exists or can be created
path_obj = Path(path)
if not path_obj.exists():
console.print(f"[yellow]{t('model_path_not_exist', path=path)}[/yellow]")
if confirm(t("model_create_directory", path=path), default=True):
try:
path_obj.mkdir(parents=True, exist_ok=True)
console.print(f"[green]✓[/green] {t('model_created_directory', path=path)}")
except (OSError, PermissionError) as e:
print_error(t("model_create_dir_failed", error=str(e)))
raise typer.Exit(1)
else:
raise typer.Abort()
# Add to configuration
settings = get_settings()
settings.add_model_path(path)
print_success(t("model_path_added", path=path))
@app.command(name="path-remove")
def path_remove(
path: str = typer.Argument(..., help="Path to remove"),
) -> None:
"""Remove a model storage path from configuration."""
# Expand user home directory
path = os.path.expanduser(path)
settings = get_settings()
if settings.remove_model_path(path):
print_success(t("model_path_removed", path=path))
else:
print_error(t("model_path_not_found", path=path))
raise typer.Exit(1)
@app.command(name="search")
def search(
query: str = typer.Argument(..., help="Search query (model name or keyword)"),
) -> None:
"""Search for models in the registry."""
from rich.table import Table
from kt_kernel.cli.utils.model_registry import get_registry
registry = get_registry()
matches = registry.search(query)
console.print()
if not matches:
print_warning(t("model_search_no_results", query=query))
console.print()
return
table = Table(title=t("model_search_results_title", query=query), show_header=True)
table.add_column(t("model_column_name"), style="cyan")
table.add_column(t("model_column_hf_repo"), style="dim")
table.add_column(t("model_column_aliases"), style="yellow")
for model in matches:
aliases = ", ".join(model.aliases[:3])
if len(model.aliases) > 3:
aliases += f" +{len(model.aliases) - 3} more"
table.add_row(model.name, model.hf_repo, aliases)
console.print(table)
console.print()

View File

@@ -0,0 +1,239 @@
"""
Quant command for kt-cli.
Quantizes model weights for CPU inference.
"""
import subprocess
import sys
from enum import Enum
from pathlib import Path
from typing import Optional
import typer
from kt_kernel.cli.config.settings import get_settings
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import (
confirm,
console,
create_progress,
print_error,
print_info,
print_step,
print_success,
print_warning,
)
from kt_kernel.cli.utils.environment import detect_cpu_info
class QuantMethod(str, Enum):
    """Quantization method accepted by `kt quant --method` (values are the CLI strings)."""
    INT4 = "int4"  # 4-bit integer weights (smallest footprint)
    INT8 = "int8"  # 8-bit integer weights (higher fidelity)
def quant(
    model: str = typer.Argument(
        ...,
        help="Model name or path to quantize",
    ),
    method: QuantMethod = typer.Option(
        QuantMethod.INT4,
        "--method",
        "-m",
        help="Quantization method",
    ),
    output: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Output path for quantized weights",
    ),
    input_type: str = typer.Option(
        "fp8",
        "--input-type",
        "-i",
        help="Input weight type (fp8, fp16, bf16)",
    ),
    cpu_threads: Optional[int] = typer.Option(
        None,
        "--cpu-threads",
        help="Number of CPU threads for quantization",
    ),
    numa_nodes: Optional[int] = typer.Option(
        None,
        "--numa-nodes",
        help="Number of NUMA nodes",
    ),
    no_merge: bool = typer.Option(
        False,
        "--no-merge",
        help="Don't merge safetensor files",
    ),
    yes: bool = typer.Option(
        False,
        "--yes",
        "-y",
        help="Skip confirmation prompts",
    ),
) -> None:
    """Quantize model weights for CPU inference.

    Resolves the input model, derives defaults for the output path and CPU
    configuration, confirms with the user, then shells out to kt-kernel's
    ``scripts/convert_cpu_weights.py`` with the collected options.
    """
    settings = get_settings()
    console.print()
    # Resolve input path (existing directory or registry lookup).
    input_path = _resolve_input_path(model, settings)
    if input_path is None:
        print_error(t("quant_input_not_found", path=model))
        raise typer.Exit(1)
    print_info(t("quant_input_path", path=str(input_path)))
    # Default output: sibling directory suffixed with the method, e.g. "X-INT4".
    if output is None:
        output = input_path.parent / f"{input_path.name}-{method.value.upper()}"
    print_info(t("quant_output_path", path=str(output)))
    print_info(t("quant_method", method=method.value.upper()))
    # Detect CPU configuration; CLI flags override the detected values.
    cpu = detect_cpu_info()
    final_cpu_threads = cpu_threads or cpu.cores
    final_numa_nodes = numa_nodes or cpu.numa_nodes
    print_info(f"CPU threads: {final_cpu_threads}")
    print_info(f"NUMA nodes: {final_numa_nodes}")
    # Warn before overwriting an existing output directory.
    if output.exists():
        print_warning(f"Output path already exists: {output}")
        if not yes:
            if not confirm("Overwrite?", default=False):
                raise typer.Abort()
    # Show a settings summary and confirm (skipped with --yes).
    if not yes:
        console.print()
        console.print("[bold]Quantization Settings:[/bold]")
        console.print(f"  Input:  {input_path}")
        console.print(f"  Output: {output}")
        console.print(f"  Method: {method.value.upper()}")
        console.print(f"  Input type: {input_type}")
        console.print()
        print_warning("Quantization may take 30-60 minutes depending on model size.")
        console.print()
        if not confirm(t("prompt_continue")):
            raise typer.Abort()
    # Locate the conversion script inside the kt-kernel checkout/install.
    kt_kernel_path = _find_kt_kernel_path()
    if kt_kernel_path is None:
        print_error("kt-kernel not found. Install with: kt install inference")
        raise typer.Exit(1)
    script_path = kt_kernel_path / "scripts" / "convert_cpu_weights.py"
    if not script_path.exists():
        print_error(f"Conversion script not found: {script_path}")
        raise typer.Exit(1)
    # Build the conversion command (run with the current interpreter).
    cmd = [
        sys.executable, str(script_path),
        "--input-path", str(input_path),
        "--input-type", input_type,
        "--output", str(output),
        "--quant-method", method.value,
        "--cpuinfer-threads", str(final_cpu_threads),
        "--threadpool-count", str(final_numa_nodes),
    ]
    if no_merge:
        cmd.append("--no-merge-safetensor")
    # Run quantization; output streams directly to the terminal.
    console.print()
    print_step(t("quant_starting"))
    console.print()
    console.print(f"[dim]$ {' '.join(cmd)}[/dim]")
    console.print()
    try:
        process = subprocess.run(cmd)
        if process.returncode == 0:
            console.print()
            print_success(t("quant_complete"))
            console.print()
            console.print(f"  Quantized weights saved to: {output}")
            console.print()
            console.print("  Use with:")
            console.print(f"    kt run {model} --weights-path {output}")
            console.print()
        else:
            # Propagate the child's exit code to the caller.
            print_error(f"Quantization failed with exit code {process.returncode}")
            raise typer.Exit(process.returncode)
    except FileNotFoundError as e:
        print_error(f"Failed to run quantization: {e}")
        raise typer.Exit(1)
    except KeyboardInterrupt:
        # 130 = conventional exit code for SIGINT.
        console.print()
        print_warning("Quantization interrupted.")
        raise typer.Exit(130)
def _resolve_input_path(model: str, settings) -> Optional[Path]:
"""Resolve the input model path."""
# Check if it's already a path
path = Path(model)
if path.exists() and (path / "config.json").exists():
return path
# Search in models directory
from kt_kernel.cli.utils.model_registry import get_registry
registry = get_registry()
matches = registry.search(model)
if matches:
model_info = matches[0]
# Try to find in all configured model directories
model_paths = settings.get_model_paths()
for models_dir in model_paths:
possible_paths = [
models_dir / model_info.name,
models_dir / model_info.name.lower(),
models_dir / model_info.hf_repo.split("/")[-1],
]
for p in possible_paths:
if p.exists() and (p / "config.json").exists():
return p
return None
def _find_kt_kernel_path() -> Optional[Path]:
"""Find the kt-kernel installation path."""
try:
import kt_kernel
return Path(kt_kernel.__file__).parent.parent
except ImportError:
pass
# Check common locations
possible_paths = [
Path.home() / "Projects" / "ktransformers" / "kt-kernel",
Path.cwd().parent / "kt-kernel",
Path.cwd() / "kt-kernel",
]
for path in possible_paths:
if path.exists() and (path / "scripts").exists():
return path
return None

View File

@@ -0,0 +1,831 @@
"""
Run command for kt-cli.
Starts the model inference server using SGLang + kt-kernel.
"""
import os
import subprocess
import sys
from pathlib import Path
from typing import Optional
import typer
from kt_kernel.cli.config.settings import get_settings
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import (
confirm,
console,
print_api_info,
print_error,
print_info,
print_server_info,
print_step,
print_success,
print_warning,
prompt_choice,
)
from kt_kernel.cli.utils.environment import detect_cpu_info, detect_gpus, detect_ram_gb
from kt_kernel.cli.utils.model_registry import MODEL_COMPUTE_FUNCTIONS, ModelInfo, get_registry
def run(
model: Optional[str] = typer.Argument(
None,
help="Model name or path (e.g., deepseek-v3, qwen3-30b). If not specified, shows interactive selection.",
),
host: str = typer.Option(
None,
"--host",
"-H",
help="Server host address",
),
port: int = typer.Option(
None,
"--port",
"-p",
help="Server port",
),
# CPU/GPU configuration
gpu_experts: Optional[int] = typer.Option(
None,
"--gpu-experts",
help="Number of GPU experts per layer",
),
cpu_threads: Optional[int] = typer.Option(
None,
"--cpu-threads",
help="Number of CPU inference threads (kt-cpuinfer, defaults to 80% of CPU cores)",
),
numa_nodes: Optional[int] = typer.Option(
None,
"--numa-nodes",
help="Number of NUMA nodes",
),
tensor_parallel_size: Optional[int] = typer.Option(
None,
"--tensor-parallel-size",
"--tp",
help="Tensor parallel size (number of GPUs)",
),
# Model paths
model_path: Optional[Path] = typer.Option(
None,
"--model-path",
help="Custom model path",
),
weights_path: Optional[Path] = typer.Option(
None,
"--weights-path",
help="Custom quantized weights path",
),
# KT-kernel options
kt_method: Optional[str] = typer.Option(
None,
"--kt-method",
help="KT quantization method (AMXINT4, RAWFP8, etc.)",
),
kt_gpu_prefill_token_threshold: Optional[int] = typer.Option(
None,
"--kt-gpu-prefill-threshold",
help="GPU prefill token threshold for kt-kernel",
),
# SGLang options
attention_backend: Optional[str] = typer.Option(
None,
"--attention-backend",
help="Attention backend (triton, flashinfer)",
),
max_total_tokens: Optional[int] = typer.Option(
None,
"--max-total-tokens",
help="Maximum total tokens",
),
max_running_requests: Optional[int] = typer.Option(
None,
"--max-running-requests",
help="Maximum running requests",
),
chunked_prefill_size: Optional[int] = typer.Option(
None,
"--chunked-prefill-size",
help="Chunked prefill size",
),
mem_fraction_static: Optional[float] = typer.Option(
None,
"--mem-fraction-static",
help="Memory fraction for static allocation",
),
watchdog_timeout: Optional[int] = typer.Option(
None,
"--watchdog-timeout",
help="Watchdog timeout in seconds",
),
served_model_name: Optional[str] = typer.Option(
None,
"--served-model-name",
help="Custom model name for API responses",
),
# Performance flags
disable_shared_experts_fusion: Optional[bool] = typer.Option(
None,
"--disable-shared-experts-fusion/--enable-shared-experts-fusion",
help="Disable/enable shared experts fusion",
),
# Other options
quantize: bool = typer.Option(
False,
"--quantize",
"-q",
help="Quantize model if weights not found",
),
advanced: bool = typer.Option(
False,
"--advanced",
help="Show advanced options",
),
dry_run: bool = typer.Option(
False,
"--dry-run",
help="Show command without executing",
),
) -> None:
"""Start model inference server."""
# Check if SGLang is installed before proceeding
from kt_kernel.cli.utils.sglang_checker import (
check_sglang_installation,
check_sglang_kt_kernel_support,
print_sglang_install_instructions,
print_sglang_kt_kernel_instructions,
)
sglang_info = check_sglang_installation()
if not sglang_info["installed"]:
console.print()
print_error(t("sglang_not_found"))
console.print()
print_sglang_install_instructions()
raise typer.Exit(1)
# Check if SGLang supports kt-kernel (has --kt-gpu-prefill-token-threshold parameter)
kt_kernel_support = check_sglang_kt_kernel_support()
if not kt_kernel_support["supported"]:
console.print()
print_error(t("sglang_kt_kernel_not_supported"))
console.print()
print_sglang_kt_kernel_instructions()
raise typer.Exit(1)
settings = get_settings()
registry = get_registry()
console.print()
# If no model specified, show interactive selection
if model is None:
model = _interactive_model_selection(registry, settings)
if model is None:
raise typer.Exit(0)
# Step 1: Detect hardware
print_step(t("run_detecting_hardware"))
gpus = detect_gpus()
cpu = detect_cpu_info()
ram = detect_ram_gb()
if gpus:
gpu_info = f"{gpus[0].name} ({gpus[0].vram_gb}GB VRAM)"
if len(gpus) > 1:
gpu_info += f" + {len(gpus) - 1} more"
print_info(t("run_gpu_info", name=gpus[0].name, vram=gpus[0].vram_gb))
else:
print_warning(t("doctor_gpu_not_found"))
gpu_info = "None"
print_info(t("run_cpu_info", name=cpu.name, cores=cpu.cores, numa=cpu.numa_nodes))
print_info(t("run_ram_info", total=int(ram)))
# Step 2: Resolve model
console.print()
print_step(t("run_checking_model"))
model_info = None
resolved_model_path = model_path
# Check if model is a path
if Path(model).exists():
resolved_model_path = Path(model)
print_info(t("run_model_path", path=str(resolved_model_path)))
# Try to infer model type from path to use default configurations
# Check directory name against known models
dir_name = resolved_model_path.name.lower()
for registered_model in registry.list_all():
# Check if directory name matches model name or aliases
if dir_name == registered_model.name.lower():
model_info = registered_model
print_info(f"Detected model type: {registered_model.name}")
break
for alias in registered_model.aliases:
if dir_name == alias.lower() or alias.lower() in dir_name:
model_info = registered_model
print_info(f"Detected model type: {registered_model.name}")
break
if model_info:
break
# Also check HuggingFace repo format (org--model)
if not model_info:
for registered_model in registry.list_all():
repo_slug = registered_model.hf_repo.replace("/", "--").lower()
if repo_slug in dir_name or dir_name in repo_slug:
model_info = registered_model
print_info(f"Detected model type: {registered_model.name}")
break
if not model_info:
print_warning("Could not detect model type from path. Using default parameters.")
console.print(" [dim]Tip: Use model name (e.g., 'kt run m2') to apply optimized configurations[/dim]")
else:
# Search in registry
matches = registry.search(model)
if not matches:
print_error(t("run_model_not_found", name=model))
console.print()
console.print("Available models:")
for m in registry.list_all()[:5]:
console.print(f" - {m.name} ({', '.join(m.aliases[:2])})")
raise typer.Exit(1)
if len(matches) == 1:
model_info = matches[0]
else:
# Multiple matches - prompt user
console.print()
print_info(t("run_multiple_matches"))
choices = [f"{m.name} ({m.hf_repo})" for m in matches]
selected = prompt_choice(t("run_select_model"), choices)
idx = choices.index(selected)
model_info = matches[idx]
# Find model path
if model_path is None:
resolved_model_path = _find_model_path(model_info, settings)
if resolved_model_path is None:
print_error(t("run_model_not_found", name=model_info.name))
console.print()
console.print(
f" Download with: kt download {model_info.aliases[0] if model_info.aliases else model_info.name}"
)
raise typer.Exit(1)
print_info(t("run_model_path", path=str(resolved_model_path)))
# Step 3: Check quantized weights (only if explicitly requested)
resolved_weights_path = None
# Only use quantized weights if explicitly specified by user
if weights_path is not None:
# User explicitly specified weights path
resolved_weights_path = weights_path
if not resolved_weights_path.exists():
print_error(t("run_weights_not_found"))
console.print(f" Path: {resolved_weights_path}")
raise typer.Exit(1)
print_info(f"Using quantized weights: {resolved_weights_path}")
elif quantize:
# User requested quantization
console.print()
print_step(t("run_quantizing"))
# TODO: Implement quantization
print_warning("Quantization not yet implemented. Please run 'kt quant' manually.")
raise typer.Exit(1)
else:
# Default: use original precision model without quantization
console.print()
print_info("Using original precision model (no quantization)")
# Step 4: Build command
# Resolve all parameters (CLI > model defaults > config > auto-detect)
final_host = host or settings.get("server.host", "0.0.0.0")
final_port = port or settings.get("server.port", 30000)
# Get defaults from model info if available
model_defaults = model_info.default_params if model_info else {}
# Determine tensor parallel size first (needed for GPU expert calculation)
# Priority: CLI > model defaults > config > auto-detect (with model constraints)
# Check if explicitly specified by user or configuration
explicitly_specified = (
tensor_parallel_size # CLI argument (highest priority)
or model_defaults.get("tensor-parallel-size") # Model defaults
or settings.get("inference.tensor_parallel_size") # Config file
)
if explicitly_specified:
# Use explicitly specified value
requested_tensor_parallel_size = explicitly_specified
else:
# Auto-detect from GPUs, considering model's max constraint
detected_gpu_count = len(gpus) if gpus else 1
if model_info and model_info.max_tensor_parallel_size is not None:
# Automatically limit to model's maximum to use as many GPUs as possible
requested_tensor_parallel_size = min(detected_gpu_count, model_info.max_tensor_parallel_size)
else:
requested_tensor_parallel_size = detected_gpu_count
# Apply model's max_tensor_parallel_size constraint if explicitly specified value exceeds it
final_tensor_parallel_size = requested_tensor_parallel_size
if model_info and model_info.max_tensor_parallel_size is not None:
if requested_tensor_parallel_size > model_info.max_tensor_parallel_size:
console.print()
print_warning(
f"Model {model_info.name} only supports up to {model_info.max_tensor_parallel_size}-way "
f"tensor parallelism, but {requested_tensor_parallel_size} was requested. "
f"Reducing to {model_info.max_tensor_parallel_size}."
)
final_tensor_parallel_size = model_info.max_tensor_parallel_size
# CPU/GPU configuration with smart defaults
# kt-cpuinfer: default to 80% of total CPU threads (cores * NUMA nodes)
total_threads = cpu.cores * cpu.numa_nodes
final_cpu_threads = (
cpu_threads
or model_defaults.get("kt-cpuinfer")
or settings.get("inference.cpu_threads")
or int(total_threads * 0.8)
)
# kt-threadpool-count: default to NUMA node count
final_numa_nodes = (
numa_nodes
or model_defaults.get("kt-threadpool-count")
or settings.get("inference.numa_nodes")
or cpu.numa_nodes
)
# kt-num-gpu-experts: use model-specific computation if available and not explicitly set
if gpu_experts is not None:
# User explicitly set it
final_gpu_experts = gpu_experts
elif model_info and model_info.name in MODEL_COMPUTE_FUNCTIONS and gpus:
# Use model-specific computation function (only if GPUs detected)
vram_per_gpu = gpus[0].vram_gb
compute_func = MODEL_COMPUTE_FUNCTIONS[model_info.name]
final_gpu_experts = compute_func(final_tensor_parallel_size, vram_per_gpu)
console.print()
print_info(
f"Auto-computed kt-num-gpu-experts: {final_gpu_experts} (TP={final_tensor_parallel_size}, VRAM={vram_per_gpu}GB per GPU)"
)
else:
# Fall back to defaults
final_gpu_experts = model_defaults.get("kt-num-gpu-experts") or settings.get("inference.gpu_experts", 1)
# KT-kernel options
final_kt_method = kt_method or model_defaults.get("kt-method") or settings.get("inference.kt_method", "AMXINT4")
final_kt_gpu_prefill_threshold = (
kt_gpu_prefill_token_threshold
or model_defaults.get("kt-gpu-prefill-token-threshold")
or settings.get("inference.kt_gpu_prefill_token_threshold", 4096)
)
# SGLang options
final_attention_backend = (
attention_backend
or model_defaults.get("attention-backend")
or settings.get("inference.attention_backend", "triton")
)
final_max_total_tokens = (
max_total_tokens or model_defaults.get("max-total-tokens") or settings.get("inference.max_total_tokens", 40000)
)
final_max_running_requests = (
max_running_requests
or model_defaults.get("max-running-requests")
or settings.get("inference.max_running_requests", 32)
)
final_chunked_prefill_size = (
chunked_prefill_size
or model_defaults.get("chunked-prefill-size")
or settings.get("inference.chunked_prefill_size", 4096)
)
final_mem_fraction_static = (
mem_fraction_static
or model_defaults.get("mem-fraction-static")
or settings.get("inference.mem_fraction_static", 0.98)
)
final_watchdog_timeout = (
watchdog_timeout or model_defaults.get("watchdog-timeout") or settings.get("inference.watchdog_timeout", 3000)
)
final_served_model_name = (
served_model_name or model_defaults.get("served-model-name") or settings.get("inference.served_model_name", "")
)
# Performance flags
if disable_shared_experts_fusion is not None:
final_disable_shared_experts_fusion = disable_shared_experts_fusion
elif "disable-shared-experts-fusion" in model_defaults:
final_disable_shared_experts_fusion = model_defaults["disable-shared-experts-fusion"]
else:
final_disable_shared_experts_fusion = settings.get("inference.disable_shared_experts_fusion", False)
# Pass all model default params to handle any extra parameters
extra_params = model_defaults if model_info else {}
cmd = _build_sglang_command(
model_path=resolved_model_path,
weights_path=resolved_weights_path,
model_info=model_info,
host=final_host,
port=final_port,
gpu_experts=final_gpu_experts,
cpu_threads=final_cpu_threads,
numa_nodes=final_numa_nodes,
tensor_parallel_size=final_tensor_parallel_size,
kt_method=final_kt_method,
kt_gpu_prefill_threshold=final_kt_gpu_prefill_threshold,
attention_backend=final_attention_backend,
max_total_tokens=final_max_total_tokens,
max_running_requests=final_max_running_requests,
chunked_prefill_size=final_chunked_prefill_size,
mem_fraction_static=final_mem_fraction_static,
watchdog_timeout=final_watchdog_timeout,
served_model_name=final_served_model_name,
disable_shared_experts_fusion=final_disable_shared_experts_fusion,
settings=settings,
extra_model_params=extra_params,
)
# Prepare environment variables
env = os.environ.copy()
# Add environment variables from advanced.env
env.update(settings.get_env_vars())
# Add environment variables from inference.env
inference_env = settings.get("inference.env", {})
if isinstance(inference_env, dict):
env.update({k: str(v) for k, v in inference_env.items()})
# Step 5: Show configuration summary
console.print()
print_step("Configuration")
# Model info
if model_info:
console.print(f" Model: [bold]{model_info.name}[/bold]")
else:
console.print(f" Model: [bold]{resolved_model_path.name}[/bold]")
console.print(f" Path: [dim]{resolved_model_path}[/dim]")
# Key parameters
console.print()
console.print(f" GPU Experts: [cyan]{final_gpu_experts}[/cyan] per layer")
console.print(f" CPU Threads (kt-cpuinfer): [cyan]{final_cpu_threads}[/cyan]")
console.print(f" NUMA Nodes (kt-threadpool-count): [cyan]{final_numa_nodes}[/cyan]")
console.print(f" Tensor Parallel: [cyan]{final_tensor_parallel_size}[/cyan]")
console.print(f" Method: [cyan]{final_kt_method}[/cyan]")
console.print(f" Attention: [cyan]{final_attention_backend}[/cyan]")
# Weights info
if resolved_weights_path:
console.print()
console.print(f" Quantized weights: [yellow]{resolved_weights_path}[/yellow]")
console.print()
console.print(f" Server: [green]http://{final_host}:{final_port}[/green]")
console.print()
# Step 6: Show or execute
if dry_run:
console.print()
console.print("[bold]Command:[/bold]")
console.print()
console.print(f" [dim]{' '.join(cmd)}[/dim]")
console.print()
return
# Execute with prepared environment variables
# Don't print "Server started" or API info here - let sglang's logs speak for themselves
# The actual startup takes time and these messages are misleading
# Print the command being executed
console.print()
console.print("[bold]Launching server with command:[/bold]")
console.print()
console.print(f" [dim]{' '.join(cmd)}[/dim]")
console.print()
try:
# Execute directly without intercepting output or signals
# This allows direct output to terminal and Ctrl+C to work naturally
process = subprocess.run(cmd, env=env)
sys.exit(process.returncode)
except FileNotFoundError:
from kt_kernel.cli.utils.sglang_checker import print_sglang_install_instructions
print_error(t("sglang_not_found"))
console.print()
print_sglang_install_instructions()
raise typer.Exit(1)
except Exception as e:
print_error(f"Failed to start server: {e}")
raise typer.Exit(1)
def _find_model_path(model_info: ModelInfo, settings) -> Optional[Path]:
    """Locate a downloaded model on disk.

    Scans every configured model directory for common directory-name
    patterns derived from the model's name, HF repo and aliases, and
    returns the first candidate directory that ships a ``config.json``.

    Args:
        model_info: Registry metadata describing the model to find.
        settings: Settings object providing ``get_model_paths()``.

    Returns:
        The model directory, or ``None`` when no candidate exists.
    """
    # Candidate directory names tried (in order) under each models dir.
    candidate_names = [
        model_info.name,
        model_info.name.lower(),
        model_info.name.replace(" ", "-"),
        model_info.hf_repo.split("/")[-1],
        model_info.hf_repo.replace("/", "--"),
    ]
    for alias in model_info.aliases:
        candidate_names.extend([alias, alias.lower()])
    for models_dir in settings.get_model_paths():
        for name in candidate_names:
            candidate = models_dir / name
            # A usable model directory must at least contain config.json.
            if candidate.exists() and (candidate / "config.json").exists():
                return candidate
    return None
def _find_weights_path(model_info: ModelInfo, settings) -> Optional[Path]:
    """Locate pre-quantized weights for *model_info* on disk.

    Tries every combination of base name (model name, its lowercase form,
    and the HF repo tail) with the known quantization suffixes, checking
    the dedicated weights directory first and then every configured model
    path.

    Returns:
        The first existing weights directory, or ``None`` when none match.
    """
    base_names = (
        model_info.name,
        model_info.name.lower(),
        model_info.hf_repo.split("/")[-1],
    )
    suffixes = ("-INT4", "-int4", "_INT4", "_int4", "-quant", "-quantized")
    # The configured weights dir (if any) takes precedence over model paths.
    weights_dir = settings.weights_dir
    search_dirs = [weights_dir] if weights_dir else []
    search_dirs.extend(settings.get_model_paths())
    for base in base_names:
        for suffix in suffixes:
            wanted = f"{base}{suffix}"
            for directory in search_dirs:
                if not directory:
                    continue
                candidate = directory / wanted
                if candidate.exists():
                    return candidate
    return None
def _build_sglang_command(
    model_path: Path,
    weights_path: Optional[Path],
    model_info: Optional[ModelInfo],
    host: str,
    port: int,
    gpu_experts: int,
    cpu_threads: int,
    numa_nodes: int,
    tensor_parallel_size: int,
    kt_method: str,
    kt_gpu_prefill_threshold: int,
    attention_backend: str,
    max_total_tokens: int,
    max_running_requests: int,
    chunked_prefill_size: int,
    mem_fraction_static: float,
    watchdog_timeout: int,
    served_model_name: str,
    disable_shared_experts_fusion: bool,
    settings,
    extra_model_params: Optional[dict] = None,  # New parameter for additional params
) -> list[str]:
    """Build the SGLang launch command.

    Assembles ``python -m sglang.launch_server`` plus all resolved
    kt-kernel and SGLang options into an argv list suitable for
    ``subprocess.run``.

    Args:
        model_path: Path to the (original precision) model directory.
        weights_path: Optional pre-quantized weights path; its presence
            forces kt-kernel on and replaces the kt weight path.
        model_info: Registry metadata, if the model was recognized.
        settings: Settings object; ``advanced.sglang_args`` is appended verbatim.
        extra_model_params: Additional model-default parameters (dash-case
            keys) forwarded as ``--<key> <value>``; booleans become bare
            flags emitted only when True. Keys already covered by the
            explicit arguments above are skipped.
        (Remaining arguments map one-to-one onto SGLang/kt-kernel flags.)

    Returns:
        The full command as a list of strings.
    """
    cmd = [
        sys.executable,
        "-m",
        "sglang.launch_server",
        "--host",
        host,
        "--port",
        str(port),
        "--model",
        str(model_path),
    ]
    # Add kt-kernel options
    # kt-kernel is needed for:
    # 1. Quantized models (when weights_path is provided)
    # 2. MoE models with CPU offloading (when kt-cpuinfer > 0 or kt-num-gpu-experts is configured)
    use_kt_kernel = False
    if weights_path:
        # Quantized model - always use kt-kernel
        use_kt_kernel = True
    elif cpu_threads > 0 or gpu_experts > 1:
        # CPU offloading configured - use kt-kernel
        use_kt_kernel = True
    elif model_info and model_info.type == "moe":
        # MoE model - likely needs kt-kernel for expert offloading
        use_kt_kernel = True
    if use_kt_kernel:
        # kt-weight-path: use quantized weights if available, otherwise the model path
        weight_path_to_use = weights_path if weights_path else model_path
        cmd.extend(
            [
                "--kt-weight-path",
                str(weight_path_to_use),
                "--kt-cpuinfer",
                str(cpu_threads),
                "--kt-threadpool-count",
                str(numa_nodes),
                "--kt-num-gpu-experts",
                str(gpu_experts),
                "--kt-method",
                kt_method,
                "--kt-gpu-prefill-token-threshold",
                str(kt_gpu_prefill_threshold),
            ]
        )
    # Add SGLang options
    cmd.extend(
        [
            "--attention-backend",
            attention_backend,
            "--trust-remote-code",
            "--mem-fraction-static",
            str(mem_fraction_static),
            "--chunked-prefill-size",
            str(chunked_prefill_size),
            "--max-running-requests",
            str(max_running_requests),
            "--max-total-tokens",
            str(max_total_tokens),
            "--watchdog-timeout",
            str(watchdog_timeout),
            "--enable-mixed-chunk",
            "--tensor-parallel-size",
            str(tensor_parallel_size),
            "--enable-p2p-check",
        ]
    )
    # Add served model name if specified
    if served_model_name:
        cmd.extend(["--served-model-name", served_model_name])
    # Add performance flags
    if disable_shared_experts_fusion:
        cmd.append("--disable-shared-experts-fusion")
    # Add any extra parameters from model defaults that weren't explicitly handled
    if extra_model_params:
        # List of parameters already handled above
        handled_params = {
            "kt-num-gpu-experts",
            "kt-cpuinfer",
            "kt-threadpool-count",
            "kt-method",
            "kt-gpu-prefill-token-threshold",
            "attention-backend",
            "tensor-parallel-size",
            "max-total-tokens",
            "max-running-requests",
            "chunked-prefill-size",
            "mem-fraction-static",
            "watchdog-timeout",
            "served-model-name",
            "disable-shared-experts-fusion",
        }
        for key, value in extra_model_params.items():
            if key in handled_params:
                continue
            if isinstance(value, bool):
                # Booleans are bare on/off flags: emit the flag only when
                # True. (Previously the flag was appended and then popped
                # again for False, which was fragile.)
                if value:
                    cmd.append(f"--{key}")
            else:
                cmd.append(f"--{key}")
                cmd.append(str(value))
    # Add extra args from settings
    extra_args = settings.get("advanced.sglang_args", [])
    if extra_args:
        cmd.extend(extra_args)
    return cmd
def _interactive_model_selection(registry, settings) -> Optional[str]:
    """Show interactive model selection interface.

    Lists locally downloaded models first, then the remaining registered
    models, numbering every displayed entry consecutively so the cancel
    option is always the next unique number.

    Returns:
        Selected model name or None if cancelled.
    """
    from rich.panel import Panel
    from rich.prompt import Prompt

    from kt_kernel.cli.i18n import get_lang

    lang = get_lang()
    # Find local models first
    local_models = registry.find_local_models()
    # Get all registered models
    all_models = registry.list_all()
    console.print()
    console.print(
        Panel.fit(
            t("run_select_model_title"),
            border_style="cyan",
        )
    )
    console.print()
    choices = []  # valid numeric answers, as strings
    choice_map = {}  # choice number -> model name
    # Consecutive display number shared by both sections. The previous
    # enumerate-based numbering skipped indices for registered models
    # that were also local, leaving gaps and letting the cancel number
    # (len(choices) + 1) collide with an existing model's number.
    next_idx = 1
    # Section 1: Local models (downloaded)
    if local_models:
        console.print(f"[bold green]{t('run_local_models')}[/bold green]")
        console.print()
        for model_info, path in local_models:
            idx = str(next_idx)
            next_idx += 1
            desc = model_info.description_zh if lang == "zh" else model_info.description
            short_desc = desc[:50] + "..." if len(desc) > 50 else desc
            console.print(f" [cyan][{idx}][/cyan] [bold]{model_info.name}[/bold]")
            console.print(f" [dim]{short_desc}[/dim]")
            console.print(f" [dim]{path}[/dim]")
            choices.append(idx)
            choice_map[idx] = model_info.name
        console.print()
    # Section 2: Remaining registered models (for reference)
    console.print(f"[bold yellow]{t('run_registered_models')}[/bold yellow]")
    console.print()
    # Filter out already shown local models
    local_model_names = {m.name for m, _ in local_models}
    for model_info in all_models:
        if model_info.name in local_model_names:
            continue
        idx = str(next_idx)
        next_idx += 1
        desc = model_info.description_zh if lang == "zh" else model_info.description
        short_desc = desc[:50] + "..." if len(desc) > 50 else desc
        console.print(f" [cyan][{idx}][/cyan] [bold]{model_info.name}[/bold]")
        console.print(f" [dim]{short_desc}[/dim]")
        console.print(f" [dim]{model_info.hf_repo}[/dim]")
        choices.append(idx)
        choice_map[idx] = model_info.name
    console.print()
    # Add cancel option (guaranteed unique now that numbering is consecutive)
    cancel_idx = str(next_idx)
    console.print(f" [cyan][{cancel_idx}][/cyan] [dim]{t('cancel')}[/dim]")
    choices.append(cancel_idx)
    console.print()
    # Prompt for selection
    try:
        selection = Prompt.ask(
            t("run_select_model_prompt"),
            choices=choices,
            default="1" if choices else cancel_idx,
        )
    except KeyboardInterrupt:
        console.print()
        return None
    if selection == cancel_idx:
        return None
    return choice_map.get(selection)

View File

@@ -0,0 +1,52 @@
"""
SFT command for kt-cli.
Fine-tuning with LlamaFactory integration.
"""
import typer
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import console
app = typer.Typer(help="Fine-tuning with LlamaFactory (coming soon)")
@app.callback(invoke_without_command=True)
def callback(ctx: typer.Context) -> None:
    """Fine-tuning commands (coming soon)."""
    # A subcommand was given: let typer dispatch to it.
    if ctx.invoked_subcommand is not None:
        return
    # Bare `kt sft`: print the coming-soon banner and the planned commands.
    console.print()
    console.print(f"[yellow]{t('feature_coming_soon')}[/yellow]")
    console.print()
    console.print("[dim]kt sft train - Train a model[/dim]")
    console.print("[dim]kt sft chat - Chat with a trained model[/dim]")
    console.print("[dim]kt sft export - Export a trained model[/dim]")
    console.print()
@app.command(name="train")
def train() -> None:
"""Train a model using LlamaFactory (coming soon)."""
console.print()
console.print(f"[yellow]{t('feature_coming_soon')}[/yellow]")
console.print()
raise typer.Exit(0)
@app.command(name="chat")
def chat() -> None:
"""Chat with a trained model using LlamaFactory (coming soon)."""
console.print()
console.print(f"[yellow]{t('feature_coming_soon')}[/yellow]")
console.print()
raise typer.Exit(0)
@app.command(name="export")
def export() -> None:
"""Export a trained model using LlamaFactory (coming soon)."""
console.print()
console.print(f"[yellow]{t('feature_coming_soon')}[/yellow]")
console.print()
raise typer.Exit(0)

View File

@@ -0,0 +1,118 @@
"""
Version command for kt-cli.
Displays version information for kt-cli and related packages.
"""
import platform
from typing import Optional
import typer
from kt_kernel.cli import __version__
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import console, print_version_table
from kt_kernel.cli.utils.environment import detect_cuda_version, get_installed_package_version
def _get_sglang_info() -> str:
    """Return the installed sglang version plus its installation source.

    Produces a Rich-markup string: editable git checkouts are tagged with
    their origin remote (shortened to ``owner/repo`` for GitHub), anything
    else is tagged ``(PyPI)``, and a missing package yields the localized
    "not installed" label.
    """
    try:
        import sglang

        # Prefer the module attribute; fall back to package metadata.
        ver = getattr(sglang, "__version__", None)
        if not ver:
            ver = get_installed_package_version("sglang")
        if not ver:
            return t("version_not_installed")

        import subprocess
        from pathlib import Path

        module_file = getattr(sglang, "__file__", None)
        if module_file:
            repo_root = Path(module_file).parent.parent
            if (repo_root / ".git").exists():
                # Editable install from a git checkout: report the remote.
                try:
                    proc = subprocess.run(
                        ["git", "remote", "get-url", "origin"],
                        cwd=repo_root,
                        capture_output=True,
                        text=True,
                        timeout=2,
                    )
                    if proc.returncode == 0:
                        remote_url = proc.stdout.strip()
                        # Shorten GitHub remotes to owner/repo.
                        if "github.com" in remote_url:
                            parts = remote_url.split("/")
                            repo_name = parts[-1].replace(".git", "")
                            owner = parts[-2]
                            return f"{ver} [dim](GitHub: {owner}/{repo_name})[/dim]"
                        return f"{ver} [dim](Git: {remote_url})[/dim]"
                except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
                    # git missing or too slow -- fall through to PyPI label.
                    pass
        # Default: installed from PyPI
        return f"{ver} [dim](PyPI)[/dim]"
    except ImportError:
        return t("version_not_installed")
def version(
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed version info"),
) -> None:
    """Show version information."""
    console.print(f"\n[bold]{t('version_info')}[/bold] v{__version__}\n")
    # Environment facts shown on every invocation.
    cuda_version = detect_cuda_version()
    env_table = {
        t("version_python"): platform.python_version(),
        t("version_platform"): f"{platform.system()} {platform.release()}",
        t("version_cuda"): cuda_version or t("version_cuda_not_found"),
    }
    print_version_table(env_table)
    # Key packages, with installation source for sglang.
    console.print("\n[bold]Packages:[/bold]\n")
    sglang_info = _get_sglang_info()
    print_version_table(
        {
            t("version_kt_kernel"): get_installed_package_version("kt-kernel") or t("version_not_installed"),
            t("version_sglang"): sglang_info,
        }
    )
    # Point the user at install instructions when SGLang is absent.
    if sglang_info == t("version_not_installed"):
        from kt_kernel.cli.utils.sglang_checker import print_sglang_install_instructions

        console.print()
        print_sglang_install_instructions()
    if verbose:
        console.print("\n[bold]Additional Packages:[/bold]\n")
        extras = {
            t("version_ktransformers"): get_installed_package_version("ktransformers") or t("version_not_installed"),
            t("version_llamafactory"): get_installed_package_version("llamafactory") or t("version_not_installed"),
        }
        for pkg in ("typer", "rich", "torch", "transformers"):
            extras[pkg] = get_installed_package_version(pkg) or t("version_not_installed")
        print_version_table(extras)
    console.print()

View File

@@ -0,0 +1 @@
"""Shell completion scripts for kt-cli."""

View File

@@ -0,0 +1,153 @@
#compdef kt
# Zsh completion for kt command
# This is a static completion script that doesn't require Python startup
# Main completion function for the `kt` command.
# Dispatches on the cursor position: the first word offers subcommands
# (plus global flags), later words offer per-command options or nested
# subcommand menus for `model`, `config` and `sft`.
_kt() {
    # Top-level subcommands, in `name:description` form for _describe.
    local -a commands
    commands=(
        'version:Show version information'
        'run:Start model inference server'
        'chat:Interactive chat with running model'
        'quant:Quantize model weights'
        'bench:Run full benchmark'
        'microbench:Run micro-benchmark'
        'doctor:Diagnose environment issues'
        'model:Manage models and storage paths'
        'config:Manage configuration'
        'sft:Fine-tuning with LlamaFactory'
    )
    # Options for `kt run` (value hints/choices after the colons).
    local -a run_opts
    run_opts=(
        '--host[Server host]:host:'
        '--port[Server port]:port:'
        '--gpu-experts[Number of GPU experts]:count:'
        '--cpu-threads[Number of CPU threads]:count:'
        '--tensor-parallel-size[Tensor parallel size]:size:'
        '--kt-method[KT method]:method:(AMXINT4 FP8 RAWINT4)'
        '--attention-backend[Attention backend]:backend:(triton flashinfer)'
        '--max-total-tokens[Maximum total tokens]:tokens:'
        '--dry-run[Show command without executing]'
        '--help[Show help message]'
    )
    # Options for `kt chat`.
    local -a chat_opts
    chat_opts=(
        '--host[Server host]:host:'
        '--port[Server port]:port:'
        '--model[Model name]:model:'
        '--temperature[Sampling temperature]:temp:'
        '--max-tokens[Maximum tokens]:tokens:'
        '--system[System prompt]:prompt:'
        '--save-history[Save conversation history]'
        '--no-save-history[Do not save history]'
        '--history-file[History file path]:path:_files'
        '--stream[Enable streaming output]'
        '--no-stream[Disable streaming output]'
        '--help[Show help message]'
    )
    # Nested subcommands of `kt model`.
    local -a model_cmds
    model_cmds=(
        'download:Download a model from HuggingFace'
        'list:List available models'
        'path-list:List all model storage paths'
        'path-add:Add a new model storage path'
        'path-remove:Remove a model storage path'
        'search:Search for models in the registry'
    )
    # Nested subcommands of `kt config`.
    local -a config_cmds
    config_cmds=(
        'show:Show all configuration'
        'get:Get configuration value'
        'set:Set configuration value'
        'reset:Reset to defaults'
        'path:Show configuration file path'
        'init:Re-run first-time setup wizard'
    )
    # Nested subcommands of `kt sft`.
    local -a sft_cmds
    sft_cmds=(
        'train:Train model'
        'chat:Chat with model'
        'export:Export model'
    )
    # Position 1 selects the subcommand; the rest is dispatched below
    # via $state. -C enables state-machine (curcontext) handling.
    _arguments -C \
        '1: :->command' \
        '*::arg:->args'
    case $state in
        command)
            _describe 'kt commands' commands
            # Also offer the global flags at the first position.
            _arguments \
                '--help[Show help message]' \
                '--version[Show version]'
            ;;
        args)
            # After _arguments shifts the line, $words[1] is the subcommand.
            case $words[1] in
                run)
                    _arguments $run_opts \
                        '1:model:'
                    ;;
                chat)
                    _arguments $chat_opts
                    ;;
                quant)
                    _arguments \
                        '--method[Quantization method]:method:' \
                        '--output[Output directory]:path:_files -/' \
                        '--help[Show help message]' \
                        '1:model:_files -/'
                    ;;
                bench|microbench)
                    _arguments \
                        '--model[Model name or path]:model:' \
                        '--config[Config file path]:path:_files' \
                        '--help[Show help message]'
                    ;;
                doctor)
                    _arguments \
                        '--verbose[Verbose output]' \
                        '--help[Show help message]'
                    ;;
                model)
                    _arguments \
                        '1: :->model_cmd' \
                        '*::arg:->model_args'
                    case $state in
                        model_cmd)
                            _describe 'model commands' model_cmds
                            ;;
                    esac
                    ;;
                config)
                    _arguments \
                        '1: :->config_cmd' \
                        '*::arg:->config_args'
                    case $state in
                        config_cmd)
                            _describe 'config commands' config_cmds
                            ;;
                    esac
                    ;;
                sft)
                    _arguments \
                        '1: :->sft_cmd' \
                        '*::arg:->sft_args'
                    case $state in
                        sft_cmd)
                            _describe 'sft commands' sft_cmds
                            ;;
                    esac
                    ;;
            esac
            ;;
    esac
}
# zsh autoload convention: the file body runs once when the function is
# first loaded, so invoke the completer with the completion arguments.
_kt "$@"

View File

@@ -0,0 +1,73 @@
#!/bin/bash
# Bash completion for kt command
# This is a static completion script that doesn't require Python startup
# Programmable completion for the `kt` CLI.
# Static word lists only, so completion needs no Python startup.
_kt_completion() {
    # cur: word being completed; prev: previous word (currently unused,
    # kept for future prev-aware completions).
    local cur prev
    COMPREPLY=()
    cur="${COMP_WORDS[COMP_CWORD]}"
    prev="${COMP_WORDS[COMP_CWORD-1]}"
    # Main commands
    local commands="version run chat quant bench microbench doctor model config sft"
    # Global options
    local global_opts="--help --version"
    # Handle subcommands.
    # Note: "${cur}" is quoted everywhere below so a partially typed word
    # cannot undergo glob expansion or word splitting.
    case "${COMP_CWORD}" in
        1)
            # First argument: suggest commands and global options
            COMPREPLY=( $(compgen -W "${commands} ${global_opts}" -- "${cur}") )
            return 0
            ;;
        *)
            # Handle specific command options
            case "${COMP_WORDS[1]}" in
                run)
                    local run_opts="--host --port --gpu-experts --cpu-threads --tensor-parallel-size --kt-method --attention-backend --max-total-tokens --dry-run --help"
                    COMPREPLY=( $(compgen -W "${run_opts}" -- "${cur}") )
                    ;;
                chat)
                    local chat_opts="--host --port --model --temperature --max-tokens --system --save-history --no-save-history --history-file --stream --no-stream --help"
                    COMPREPLY=( $(compgen -W "${chat_opts}" -- "${cur}") )
                    ;;
                quant)
                    local quant_opts="--method --output --help"
                    COMPREPLY=( $(compgen -W "${quant_opts}" -- "${cur}") )
                    ;;
                bench|microbench)
                    local bench_opts="--model --config --help"
                    COMPREPLY=( $(compgen -W "${bench_opts}" -- "${cur}") )
                    ;;
                doctor)
                    local doctor_opts="--verbose --help"
                    COMPREPLY=( $(compgen -W "${doctor_opts}" -- "${cur}") )
                    ;;
                model)
                    local model_cmds="download list path-list path-add path-remove search"
                    local model_opts="--help"
                    COMPREPLY=( $(compgen -W "${model_cmds} ${model_opts}" -- "${cur}") )
                    ;;
                config)
                    local config_cmds="show get set reset path init model-path-list model-path-add model-path-remove"
                    local config_opts="--help"
                    COMPREPLY=( $(compgen -W "${config_cmds} ${config_opts}" -- "${cur}") )
                    ;;
                sft)
                    local sft_cmds="train chat export"
                    local sft_opts="--help"
                    COMPREPLY=( $(compgen -W "${sft_cmds} ${sft_opts}" -- "${cur}") )
                    ;;
                version)
                    COMPREPLY=( $(compgen -W "--help" -- "${cur}") )
                    ;;
                *)
                    COMPREPLY=()
                    ;;
            esac
            ;;
    esac
}
complete -F _kt_completion kt

View File

@@ -0,0 +1,74 @@
# Fish completion for kt command
# This is a static completion script that doesn't require Python startup
# Main commands
complete -c kt -f -n "__fish_use_subcommand" -a "version" -d "Show version information"
complete -c kt -f -n "__fish_use_subcommand" -a "run" -d "Start model inference server"
complete -c kt -f -n "__fish_use_subcommand" -a "chat" -d "Interactive chat with running model"
complete -c kt -f -n "__fish_use_subcommand" -a "quant" -d "Quantize model weights"
complete -c kt -f -n "__fish_use_subcommand" -a "bench" -d "Run full benchmark"
complete -c kt -f -n "__fish_use_subcommand" -a "microbench" -d "Run micro-benchmark"
complete -c kt -f -n "__fish_use_subcommand" -a "doctor" -d "Diagnose environment issues"
complete -c kt -f -n "__fish_use_subcommand" -a "model" -d "Manage models and storage paths"
complete -c kt -f -n "__fish_use_subcommand" -a "config" -d "Manage configuration"
complete -c kt -f -n "__fish_use_subcommand" -a "sft" -d "Fine-tuning with LlamaFactory"
# Global options
complete -c kt -l help -d "Show help message"
complete -c kt -l version -d "Show version"
# Run command options
complete -c kt -f -n "__fish_seen_subcommand_from run" -l host -d "Server host"
complete -c kt -f -n "__fish_seen_subcommand_from run" -l port -d "Server port"
complete -c kt -f -n "__fish_seen_subcommand_from run" -l gpu-experts -d "Number of GPU experts"
complete -c kt -f -n "__fish_seen_subcommand_from run" -l cpu-threads -d "Number of CPU threads"
complete -c kt -f -n "__fish_seen_subcommand_from run" -l tensor-parallel-size -d "Tensor parallel size"
complete -c kt -f -n "__fish_seen_subcommand_from run" -l kt-method -d "KT method"
complete -c kt -f -n "__fish_seen_subcommand_from run" -l attention-backend -d "Attention backend"
complete -c kt -f -n "__fish_seen_subcommand_from run" -l max-total-tokens -d "Maximum total tokens"
complete -c kt -f -n "__fish_seen_subcommand_from run" -l dry-run -d "Show command without executing"
# Chat command options
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l host -d "Server host"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l port -d "Server port"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l model -d "Model name"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l temperature -d "Sampling temperature"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l max-tokens -d "Maximum tokens"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l system -d "System prompt"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l save-history -d "Save conversation history"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l no-save-history -d "Do not save history"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l history-file -d "History file path"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l stream -d "Enable streaming output"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l no-stream -d "Disable streaming output"
# Quant command options
complete -c kt -f -n "__fish_seen_subcommand_from quant" -l method -d "Quantization method"
complete -c kt -f -n "__fish_seen_subcommand_from quant" -l output -d "Output directory"
# Bench command options
complete -c kt -f -n "__fish_seen_subcommand_from bench microbench" -l model -d "Model name or path"
complete -c kt -f -n "__fish_seen_subcommand_from bench microbench" -l config -d "Config file path"
# Doctor command options
complete -c kt -f -n "__fish_seen_subcommand_from doctor" -l verbose -d "Verbose output"
# Model subcommands
complete -c kt -f -n "__fish_seen_subcommand_from model; and not __fish_seen_subcommand_from download list path-list path-add path-remove search" -a "download" -d "Download a model from HuggingFace"
complete -c kt -f -n "__fish_seen_subcommand_from model; and not __fish_seen_subcommand_from download list path-list path-add path-remove search" -a "list" -d "List available models"
complete -c kt -f -n "__fish_seen_subcommand_from model; and not __fish_seen_subcommand_from download list path-list path-add path-remove search" -a "path-list" -d "List all model storage paths"
complete -c kt -f -n "__fish_seen_subcommand_from model; and not __fish_seen_subcommand_from download list path-list path-add path-remove search" -a "path-add" -d "Add a new model storage path"
complete -c kt -f -n "__fish_seen_subcommand_from model; and not __fish_seen_subcommand_from download list path-list path-add path-remove search" -a "path-remove" -d "Remove a model storage path"
complete -c kt -f -n "__fish_seen_subcommand_from model; and not __fish_seen_subcommand_from download list path-list path-add path-remove search" -a "search" -d "Search for models in the registry"
# Config subcommands (kept in sync with the bash completion's config_cmds list)
complete -c kt -f -n "__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show get set reset path init model-path-list model-path-add model-path-remove" -a "show" -d "Show all configuration"
complete -c kt -f -n "__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show get set reset path init model-path-list model-path-add model-path-remove" -a "get" -d "Get configuration value"
complete -c kt -f -n "__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show get set reset path init model-path-list model-path-add model-path-remove" -a "set" -d "Set configuration value"
complete -c kt -f -n "__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show get set reset path init model-path-list model-path-add model-path-remove" -a "reset" -d "Reset to defaults"
complete -c kt -f -n "__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show get set reset path init model-path-list model-path-add model-path-remove" -a "path" -d "Show configuration file path"
complete -c kt -f -n "__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show get set reset path init model-path-list model-path-add model-path-remove" -a "init" -d "Re-run first-time setup wizard"
complete -c kt -f -n "__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show get set reset path init model-path-list model-path-add model-path-remove" -a "model-path-list" -d "List all model storage paths"
complete -c kt -f -n "__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show get set reset path init model-path-list model-path-add model-path-remove" -a "model-path-add" -d "Add a new model storage path"
complete -c kt -f -n "__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show get set reset path init model-path-list model-path-add model-path-remove" -a "model-path-remove" -d "Remove a model storage path"
# SFT subcommands
complete -c kt -f -n "__fish_seen_subcommand_from sft; and not __fish_seen_subcommand_from train chat export" -a "train" -d "Train model"
complete -c kt -f -n "__fish_seen_subcommand_from sft; and not __fish_seen_subcommand_from train chat export" -a "chat" -d "Chat with model"
complete -c kt -f -n "__fish_seen_subcommand_from sft; and not __fish_seen_subcommand_from train chat export" -a "export" -d "Export model"

View File

@@ -0,0 +1,7 @@
"""
Configuration management for kt-cli.
"""
from kt_kernel.cli.config.settings import Settings, get_settings
__all__ = ["Settings", "get_settings"]

View File

@@ -0,0 +1,311 @@
"""
Configuration management for kt-cli.
Handles reading and writing configuration from ~/.ktransformers/config.yaml
"""
import os
from pathlib import Path
from typing import Any, Optional
import yaml
# Default configuration directory and well-known locations inside it.
DEFAULT_CONFIG_DIR = Path.home() / ".ktransformers"
DEFAULT_CONFIG_FILE = DEFAULT_CONFIG_DIR / "config.yaml"
DEFAULT_MODELS_DIR = DEFAULT_CONFIG_DIR / "models"
DEFAULT_CACHE_DIR = DEFAULT_CONFIG_DIR / "cache"
# Default configuration values. User config from config.yaml is deep-merged
# over this tree by Settings._load, so every key here is always present.
DEFAULT_CONFIG = {
    "general": {
        "language": "auto",  # auto, en, zh
        "color": True,
        "verbose": False,
    },
    "paths": {
        # "models" may be a single path string or a list of path strings
        # (see Settings.get_model_paths).
        "models": str(DEFAULT_MODELS_DIR),
        "cache": str(DEFAULT_CACHE_DIR),
        "weights": "",  # Custom quantized weights path
    },
    "server": {
        "host": "0.0.0.0",
        "port": 30000,
    },
    "inference": {
        # Inference parameters are model-specific and should not have defaults
        # They will be auto-detected or use model-specific optimizations
        # Environment variables (general optimizations)
        "env": {
            "PYTORCH_ALLOC_CONF": "expandable_segments:True",
            "SGLANG_ENABLE_JIT_DEEPGEMM": "0",
        },
    },
    "download": {
        "mirror": "",  # HuggingFace mirror URL
        "resume": True,
        "verify": True,
    },
    "advanced": {
        # Environment variables to set when running
        "env": {},
        # Extra arguments to pass to sglang
        "sglang_args": [],
        # Extra arguments to pass to llamafactory
        "llamafactory_args": [],
    },
    "dependencies": {
        # SGLang installation source configuration
        "sglang": {
            "source": "github",  # "pypi" or "github"
            "repo": "https://github.com/kvcache-ai/sglang",
            "branch": "main",
        },
    },
}
class Settings:
    """Configuration manager for kt-cli.

    Loads DEFAULT_CONFIG, deep-merges any user overrides found in the YAML
    file at ``config_path``, and persists the merged configuration back to
    disk whenever a value is changed.
    """

    def __init__(self, config_path: Optional[Path] = None):
        """Initialize settings manager.

        Args:
            config_path: Path to config file. Defaults to ~/.ktransformers/config.yaml
        """
        self.config_path = config_path or DEFAULT_CONFIG_FILE
        self.config_dir = self.config_path.parent
        self._config: dict[str, Any] = {}
        self._load()

    def _ensure_dirs(self) -> None:
        """Ensure configuration, model, and cache directories exist."""
        self.config_dir.mkdir(parents=True, exist_ok=True)
        # Ensure all configured model storage paths exist
        for path in self.get_model_paths():
            path.mkdir(parents=True, exist_ok=True)
        Path(self.get("paths.cache", DEFAULT_CACHE_DIR)).mkdir(parents=True, exist_ok=True)

    def _load(self) -> None:
        """Load configuration from file, merging user values over defaults."""
        self._config = self._deep_copy(DEFAULT_CONFIG)
        if self.config_path.exists():
            try:
                with open(self.config_path, "r", encoding="utf-8") as f:
                    user_config = yaml.safe_load(f) or {}
                self._deep_merge(self._config, user_config)
            except (yaml.YAMLError, OSError) as e:
                # A malformed or unreadable config is not fatal: warn and
                # continue with the built-in defaults.
                print(f"Warning: Failed to load config: {e}")
        self._ensure_dirs()

    def _save(self) -> None:
        """Save configuration to file.

        Raises:
            RuntimeError: If the config file cannot be written.
        """
        self._ensure_dirs()
        try:
            with open(self.config_path, "w", encoding="utf-8") as f:
                yaml.dump(self._config, f, default_flow_style=False, allow_unicode=True)
        except OSError as e:
            # Chain the original OSError so callers can inspect the cause.
            raise RuntimeError(f"Failed to save config: {e}") from e

    def _deep_copy(self, obj: Any) -> Any:
        """Create a deep copy of a nested dict/list structure."""
        if isinstance(obj, dict):
            return {k: self._deep_copy(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [self._deep_copy(item) for item in obj]
        return obj

    def _deep_merge(self, base: dict, override: dict) -> None:
        """Deep merge ``override`` into ``base`` in place.

        Nested dicts are merged recursively; any other value replaces the
        corresponding entry in ``base``.
        """
        for key, value in override.items():
            if key in base and isinstance(base[key], dict) and isinstance(value, dict):
                self._deep_merge(base[key], value)
            else:
                base[key] = value

    def get(self, key: str, default: Any = None) -> Any:
        """Get a configuration value by dot-separated key.

        Args:
            key: Dot-separated key path (e.g., "server.port")
            default: Default value if key not found

        Returns:
            Configuration value or default
        """
        value: Any = self._config
        for part in key.split("."):
            if isinstance(value, dict) and part in value:
                value = value[part]
            else:
                return default
        return value

    def set(self, key: str, value: Any) -> None:
        """Set a configuration value by dot-separated key and persist it.

        Args:
            key: Dot-separated key path (e.g., "server.port")
            value: Value to set
        """
        parts = key.split(".")
        config = self._config
        # Navigate to parent, creating (or replacing non-dict) intermediate
        # nodes so a scalar at an intermediate key cannot crash the walk.
        for part in parts[:-1]:
            if part not in config or not isinstance(config[part], dict):
                config[part] = {}
            config = config[part]
        # Set value
        config[parts[-1]] = value
        self._save()

    def delete(self, key: str) -> bool:
        """Delete a configuration value.

        Args:
            key: Dot-separated key path

        Returns:
            True if key was deleted, False if not found
        """
        parts = key.split(".")
        config = self._config
        # Navigate to parent; bail out if an intermediate node is missing
        # or is not a dict (nothing can be deleted underneath a scalar).
        for part in parts[:-1]:
            if not isinstance(config, dict) or part not in config:
                return False
            config = config[part]
        # Delete key
        if isinstance(config, dict) and parts[-1] in config:
            del config[parts[-1]]
            self._save()
            return True
        return False

    def reset(self) -> None:
        """Reset configuration to defaults and persist."""
        self._config = self._deep_copy(DEFAULT_CONFIG)
        self._save()

    def get_all(self) -> dict[str, Any]:
        """Get a deep copy of all configuration values."""
        return self._deep_copy(self._config)

    def get_env_vars(self) -> dict[str, str]:
        """Get environment variables to set, read from ``advanced.env``."""
        env_vars: dict[str, str] = {}
        advanced_env = self.get("advanced.env", {})
        if isinstance(advanced_env, dict):
            env_vars.update({k: str(v) for k, v in advanced_env.items()})
        return env_vars

    @property
    def models_dir(self) -> Path:
        """Get the primary models directory path (for backward compatibility)."""
        paths = self.get_model_paths()
        return paths[0] if paths else Path(DEFAULT_MODELS_DIR)

    def get_model_paths(self) -> list[Path]:
        """Get all model directory paths.

        Returns a list of Path objects. Supports both:
        - Single path: paths.models = "/path/to/models"
        - Multiple paths: paths.models = ["/path/1", "/path/2"]
        """
        models_config = self.get("paths.models", DEFAULT_MODELS_DIR)
        # Handle both string and list
        if isinstance(models_config, str):
            return [Path(models_config)]
        if isinstance(models_config, list):
            return [Path(p) for p in models_config]
        return [Path(DEFAULT_MODELS_DIR)]

    def add_model_path(self, path: str) -> None:
        """Add a new model path to the configuration (no-op if present)."""
        models_config = self.get("paths.models", DEFAULT_MODELS_DIR)
        # Normalize the stored value to a list of strings
        if isinstance(models_config, str):
            paths = [models_config]
        elif isinstance(models_config, list):
            paths = list(models_config)
        else:
            paths = []
        # Add new path if not already present
        if path not in paths:
            paths.append(path)
        self.set("paths.models", paths)

    def remove_model_path(self, path: str) -> bool:
        """Remove a model path from the configuration.

        Returns True if path was removed, False if not found. The last
        remaining path can never be removed.
        """
        models_config = self.get("paths.models", DEFAULT_MODELS_DIR)
        if isinstance(models_config, list) and path in models_config:
            paths = list(models_config)
            paths.remove(path)
            # Don't allow removing all paths
            if not paths:
                return False
            # Collapse back to a plain string when a single path remains
            self.set("paths.models", paths if len(paths) > 1 else paths[0])
            return True
        # A single-string config (or unknown type) cannot lose its only path.
        return False

    @property
    def cache_dir(self) -> Path:
        """Get the cache directory path."""
        return Path(self.get("paths.cache", DEFAULT_CACHE_DIR))

    @property
    def weights_dir(self) -> Optional[Path]:
        """Get the custom quantized-weights directory path, if configured."""
        weights = self.get("paths.weights", "")
        return Path(weights) if weights else None
# Lazily-created, process-wide Settings singleton.
_settings: Optional[Settings] = None


def get_settings() -> Settings:
    """Return the shared Settings instance, creating it on first access."""
    global _settings
    instance = _settings
    if instance is None:
        instance = Settings()
        _settings = instance
    return instance
def reset_settings() -> None:
    """Reset the global settings instance.

    Clears the cached singleton so the next get_settings() call constructs
    a fresh Settings object (reloading configuration from disk).
    """
    global _settings
    _settings = None

View File

@@ -0,0 +1,655 @@
"""
Internationalization (i18n) module for kt-cli.
Supports English and Chinese languages, with automatic detection based on
system locale or KT_LANG environment variable.
"""
import os
from typing import Any
# Message definitions for all supported languages
MESSAGES: dict[str, dict[str, str]] = {
"en": {
# General
"welcome": "Welcome to KTransformers!",
"goodbye": "Goodbye!",
"error": "Error",
"warning": "Warning",
"success": "Success",
"info": "Info",
"yes": "Yes",
"no": "No",
"cancel": "Cancel",
"confirm": "Confirm",
"done": "Done",
"failed": "Failed",
"skip": "Skip",
"back": "Back",
"next": "Next",
"retry": "Retry",
"abort": "Abort",
# Version command
"version_info": "KTransformers CLI",
"version_python": "Python",
"version_platform": "Platform",
"version_cuda": "CUDA",
"version_cuda_not_found": "Not found",
"version_kt_kernel": "kt-kernel",
"version_ktransformers": "ktransformers",
"version_sglang": "sglang",
"version_llamafactory": "llamafactory",
"version_not_installed": "Not installed",
# Install command
"install_detecting_env": "Detecting environment managers...",
"install_found": "Found {name} (version {version})",
"install_not_found": "Not found: {name}",
"install_checking_env": "Checking existing environments...",
"install_env_exists": "Found existing 'kt' environment",
"install_env_not_exists": "No 'kt' environment found",
"install_no_env_manager": "No virtual environment manager detected",
"install_select_method": "Please select installation method:",
"install_method_conda": "Create new conda environment 'kt' (Recommended)",
"install_method_venv": "Create new venv environment",
"install_method_uv": "Create new uv environment (Fast)",
"install_method_docker": "Use Docker container",
"install_method_system": "Install to system Python (Not recommended)",
"install_select_mode": "Please select installation mode:",
"install_mode_inference": "Inference - Install kt-kernel + SGLang",
"install_mode_sft": "Training - Install kt-sft + LlamaFactory",
"install_mode_full": "Full - Install all components",
"install_creating_env": "Creating {type} environment '{name}'...",
"install_env_created": "Environment created successfully",
"install_installing_deps": "Installing dependencies...",
"install_checking_deps": "Checking dependency versions...",
"install_dep_ok": "OK",
"install_dep_outdated": "Needs update",
"install_dep_missing": "Missing",
"install_installing_pytorch": "Installing PyTorch...",
"install_installing_from_requirements": "Installing from requirements file...",
"install_deps_outdated": "Found {count} package(s) that need updating. Continue?",
"install_updating": "Updating packages...",
"install_complete": "Installation complete!",
"install_activate_hint": "Activate environment: {command}",
"install_start_hint": "Get started: kt run --help",
"install_docker_pulling": "Pulling Docker image...",
"install_docker_complete": "Docker image ready!",
"install_docker_run_hint": "Run with: docker run --gpus all -p 30000:30000 {image} kt run {model}",
"install_in_venv": "Running in virtual environment: {name}",
"install_continue_without_venv": "Continue installing to system Python?",
"install_already_installed": "All dependencies are already installed!",
"install_confirm": "Install {count} package(s)?",
# Install - System dependencies
"install_checking_system_deps": "Checking system dependencies...",
"install_dep_name": "Dependency",
"install_dep_status": "Status",
"install_deps_all_installed": "All system dependencies are installed",
"install_deps_install_prompt": "Install missing dependencies?",
"install_installing_system_deps": "Installing system dependencies...",
"install_installing_dep": "Installing {name}",
"install_dep_no_install_cmd": "No install command available for {name} on {os}",
"install_dep_install_failed": "Failed to install {name}",
"install_deps_skipped": "Skipping dependency installation",
"install_deps_failed": "Failed to install system dependencies",
# Install - CPU detection
"install_auto_detect_cpu": "Auto-detecting CPU capabilities...",
"install_cpu_features": "Detected CPU features: {features}",
"install_cpu_no_features": "No advanced CPU features detected",
# Install - Build configuration
"install_build_config": "Build Configuration:",
"install_native_warning": "Note: Binary optimized for THIS CPU only (not portable)",
"install_building_from_source": "Building kt-kernel from source...",
"install_build_failed": "Build failed",
"install_build_success": "Build completed successfully",
# Install - Verification
"install_verifying": "Verifying installation...",
"install_verify_success": "kt-kernel {version} ({variant} variant) installed successfully",
"install_verify_failed": "Verification failed: {error}",
# Install - Docker
"install_docker_guide_title": "Docker Installation",
"install_docker_guide_desc": "For Docker installation, please refer to the official guide:",
# Config command
"config_show_title": "Current Configuration",
"config_set_success": "Configuration updated: {key} = {value}",
"config_get_value": "{key} = {value}",
"config_get_not_found": "Configuration key '{key}' not found",
"config_reset_confirm": "This will reset all configurations to default. Continue?",
"config_reset_success": "Configuration reset to default",
"config_file_location": "Configuration file: {path}",
# Doctor command
"doctor_title": "KTransformers Environment Diagnostics",
"doctor_checking": "Running diagnostics...",
"doctor_check_python": "Python version",
"doctor_check_cuda": "CUDA availability",
"doctor_check_gpu": "GPU detection",
"doctor_check_cpu": "CPU",
"doctor_check_cpu_isa": "CPU Instructions",
"doctor_check_numa": "NUMA Topology",
"doctor_check_memory": "System memory",
"doctor_check_disk": "Disk space",
"doctor_check_packages": "Required packages",
"doctor_check_env": "Environment variables",
"doctor_status_ok": "OK",
"doctor_status_warning": "Warning",
"doctor_status_error": "Error",
"doctor_gpu_found": "Found {count} GPU(s): {names}",
"doctor_gpu_not_found": "No GPU detected",
"doctor_cpu_info": "{name} ({cores} cores / {threads} threads)",
"doctor_cpu_isa_info": "{isa_list}",
"doctor_cpu_isa_missing": "Missing recommended: {missing}",
"doctor_numa_info": "{nodes} node(s)",
"doctor_numa_detail": "{node}: CPUs {cpus}",
"doctor_memory_info": "{available} available / {total} total",
"doctor_memory_freq": "{available} available / {total} total ({freq}MHz {type})",
"doctor_disk_info": "{available} available at {path}",
"doctor_all_ok": "All checks passed! Your environment is ready.",
"doctor_has_issues": "Some issues were found. Please review the warnings/errors above.",
# Run command
"run_detecting_hardware": "Detecting hardware configuration...",
"run_gpu_info": "GPU: {name} ({vram}GB VRAM)",
"run_cpu_info": "CPU: {name} ({cores} cores, {numa} NUMA nodes)",
"run_ram_info": "RAM: {total}GB",
"run_checking_model": "Checking model status...",
"run_model_path": "Model path: {path}",
"run_weights_not_found": "Quantized weights not found",
"run_quant_prompt": "Quantize model now? (This may take a while)",
"run_quantizing": "Quantizing model...",
"run_starting_server": "Starting server...",
"run_server_mode": "Mode: SGLang + kt-kernel",
"run_server_port": "Port: {port}",
"run_gpu_experts": "GPU experts: {count}/layer",
"run_cpu_threads": "CPU threads: {count}",
"run_server_started": "Server started!",
"run_api_url": "API URL: http://{host}:{port}",
"run_docs_url": "Docs URL: http://{host}:{port}/docs",
"run_stop_hint": "Press Ctrl+C to stop the server",
"run_model_not_found": "Model '{name}' not found. Run 'kt download' first.",
"run_multiple_matches": "Multiple models found. Please select:",
"run_select_model": "Select model",
"run_select_model_title": "Select a model to run",
"run_select_model_prompt": "Enter number",
"run_local_models": "Local Models (Downloaded)",
"run_registered_models": "Registered Models",
# Download command
"download_list_title": "Available Models",
"download_searching": "Searching for model '{name}'...",
"download_found": "Found: {name}",
"download_multiple_found": "Multiple matches found:",
"download_select": "Select model to download:",
"download_destination": "Destination: {path}",
"download_starting": "Starting download...",
"download_progress": "Downloading {name}...",
"download_complete": "Download complete!",
"download_already_exists": "Model already exists at {path}",
"download_overwrite_prompt": "Overwrite existing files?",
# Quant command
"quant_input_path": "Input path: {path}",
"quant_output_path": "Output path: {path}",
"quant_method": "Quantization method: {method}",
"quant_starting": "Starting quantization...",
"quant_progress": "Quantizing...",
"quant_complete": "Quantization complete!",
"quant_input_not_found": "Input model not found at {path}",
# SFT command
"sft_mode_train": "Training mode",
"sft_mode_chat": "Chat mode",
"sft_mode_export": "Export mode",
"sft_config_path": "Config file: {path}",
"sft_starting": "Starting {mode}...",
"sft_complete": "{mode} complete!",
"sft_config_not_found": "Config file not found: {path}",
# Bench command
"bench_starting": "Starting benchmark...",
"bench_type": "Benchmark type: {type}",
"bench_complete": "Benchmark complete!",
"bench_results_title": "Benchmark Results",
# Common prompts
"prompt_continue": "Continue?",
"prompt_select": "Please select:",
"prompt_enter_value": "Enter value:",
"prompt_confirm_action": "Confirm this action?",
# First-run setup - Model path selection
"setup_model_path_title": "Model Storage Location",
"setup_model_path_desc": "LLM models are large (50-200GB+). Please select a storage location with sufficient space:",
"setup_scanning_disks": "Scanning available storage locations...",
"setup_disk_option": "{path} ({available} available / {total} total)",
"setup_disk_option_recommended": "{path} ({available} available / {total} total) [Recommended]",
"setup_custom_path": "Enter custom path",
"setup_enter_custom_path": "Enter the path for model storage",
"setup_path_not_exist": "Path does not exist. Create it?",
"setup_path_no_write": "No write permission for this path. Please choose another.",
"setup_path_low_space": "Warning: Less than 100GB available. Large models may not fit.",
"setup_model_path_set": "Model storage path set to: {path}",
"setup_no_large_disk": "No large storage locations found. Using default path.",
"setup_scanning_models": "Scanning for existing models...",
"setup_found_models": "Found {count} model(s):",
"setup_model_info": "{name} ({size}, {type})",
"setup_no_models_found": "No existing models found in this location.",
"setup_location_has_models": "{count} model(s) found",
"setup_installing_completion": "Installing shell completion for {shell}...",
"setup_completion_installed": "Shell completion installed! Restart terminal to enable.",
"setup_completion_failed": "Failed to install shell completion. Run 'kt --install-completion' manually.",
# Auto completion
"completion_installed_title": "Tab Completion",
"completion_installed_for": "Shell completion installed for {shell}",
"completion_activate_now": "To enable completion in this terminal session, run:",
"completion_next_session": "Completion will be automatically enabled in new terminal sessions.",
# SGLang
"sglang_not_found": "SGLang not found",
"sglang_pypi_warning": "SGLang from PyPI may not be compatible with kt-kernel",
"sglang_pypi_hint": 'SGLang from PyPI may not be compatible. Install from source: git clone https://github.com/kvcache-ai/sglang && cd sglang && pip install -e "python[all]"',
"sglang_install_hint": 'Install SGLang: git clone https://github.com/kvcache-ai/sglang && cd sglang && pip install -e "python[all]"',
"sglang_recommend_source": 'Recommend reinstalling from source: git clone https://github.com/kvcache-ai/sglang && cd sglang && pip install -e "python[all]"',
"sglang_kt_kernel_not_supported": "SGLang does not support kt-kernel (missing --kt-gpu-prefill-token-threshold parameter)",
"sglang_checking_kt_kernel_support": "Checking SGLang kt-kernel support...",
"sglang_kt_kernel_supported": "SGLang kt-kernel support verified",
# Chat
"chat_proxy_detected": "Proxy detected in environment",
"chat_proxy_confirm": "Use proxy for connection?",
"chat_proxy_disabled": "Proxy disabled for this session",
# Model command
"model_supported_title": "KTransformers Supported Models",
"model_column_model": "Model",
"model_column_status": "Status",
"model_column_local_path": "Local Path",
"model_status_local": "Local",
"model_status_not_downloaded": "Not downloaded",
"model_usage_title": "Usage",
"model_usage_download": "Download a model:",
"model_usage_list_local": "List local models:",
"model_usage_search": "Search models:",
"model_storage_paths_title": "Model Storage Paths",
"model_local_models_title": "Locally Downloaded Models",
"model_available_models_title": "Available Models",
"model_no_local_models": "No locally downloaded models found",
"model_download_hint": "Download a model with:",
"model_download_usage_hint": "Usage: kt model download <model-name>",
"model_download_list_hint": "Use 'kt model download --list' to see available models.",
"model_download_hf_hint": "Or specify a HuggingFace repo directly: kt model download org/model-name",
"model_saved_to": "Model saved to: {path}",
"model_start_with": "Start with: kt run {name}",
"model_download_failed": "Download failed: {error}",
"model_hf_cli_not_found": "huggingface-cli not found. Install with: pip install huggingface-hub",
"model_path_not_exist": "Path does not exist: {path}",
"model_create_directory": "Create directory {path}?",
"model_created_directory": "Created directory: {path}",
"model_create_dir_failed": "Failed to create directory: {error}",
"model_path_added": "Added model path: {path}",
"model_path_removed": "Removed model path: {path}",
"model_path_not_found": "Path not found in configuration or cannot remove last path: {path}",
"model_search_no_results": "No models found matching '{query}'",
"model_search_results_title": "Search Results for '{query}'",
"model_column_name": "Name",
"model_column_hf_repo": "HuggingFace Repo",
"model_column_aliases": "Aliases",
# Coming soon
"feature_coming_soon": "This feature is coming soon...",
},
"zh": {
# General
"welcome": "欢迎使用 KTransformers",
"goodbye": "再见!",
"error": "错误",
"warning": "警告",
"success": "成功",
"info": "信息",
"yes": "",
"no": "",
"cancel": "取消",
"confirm": "确认",
"done": "完成",
"failed": "失败",
"skip": "跳过",
"back": "返回",
"next": "下一步",
"retry": "重试",
"abort": "中止",
# Version command
"version_info": "KTransformers CLI",
"version_python": "Python",
"version_platform": "平台",
"version_cuda": "CUDA",
"version_cuda_not_found": "未找到",
"version_kt_kernel": "kt-kernel",
"version_ktransformers": "ktransformers",
"version_sglang": "sglang",
"version_llamafactory": "llamafactory",
"version_not_installed": "未安装",
# Install command
"install_detecting_env": "检测环境管理工具...",
"install_found": "发现 {name} (版本 {version})",
"install_not_found": "未找到: {name}",
"install_checking_env": "检查现有环境...",
"install_env_exists": "发现现有 'kt' 环境",
"install_env_not_exists": "未发现 'kt' 环境",
"install_no_env_manager": "未检测到虚拟环境管理工具",
"install_select_method": "请选择安装方式:",
"install_method_conda": "创建新的 conda 环境 'kt' (推荐)",
"install_method_venv": "创建新的 venv 环境",
"install_method_uv": "创建新的 uv 环境 (快速)",
"install_method_docker": "使用 Docker 容器",
"install_method_system": "安装到系统 Python (不推荐)",
"install_select_mode": "请选择安装模式:",
"install_mode_inference": "推理模式 - 安装 kt-kernel + SGLang",
"install_mode_sft": "训练模式 - 安装 kt-sft + LlamaFactory",
"install_mode_full": "完整安装 - 安装所有组件",
"install_creating_env": "正在创建 {type} 环境 '{name}'...",
"install_env_created": "环境创建成功",
"install_installing_deps": "正在安装依赖...",
"install_checking_deps": "检查依赖版本...",
"install_dep_ok": "正常",
"install_dep_outdated": "需更新",
"install_dep_missing": "缺失",
"install_installing_pytorch": "正在安装 PyTorch...",
"install_installing_from_requirements": "从依赖文件安装...",
"install_deps_outdated": "发现 {count} 个包需要更新,是否继续?",
"install_updating": "正在更新包...",
"install_complete": "安装完成!",
"install_activate_hint": "激活环境: {command}",
"install_start_hint": "开始使用: kt run --help",
"install_docker_pulling": "正在拉取 Docker 镜像...",
"install_docker_complete": "Docker 镜像已就绪!",
"install_docker_run_hint": "运行: docker run --gpus all -p 30000:30000 {image} kt run {model}",
"install_in_venv": "当前在虚拟环境中: {name}",
"install_continue_without_venv": "继续安装到系统 Python",
"install_already_installed": "所有依赖已安装!",
"install_confirm": "安装 {count} 个包?",
# Install - System dependencies
"install_checking_system_deps": "检查系统依赖...",
"install_dep_name": "依赖项",
"install_dep_status": "状态",
"install_deps_all_installed": "所有系统依赖已安装",
"install_deps_install_prompt": "是否安装缺失的依赖?",
"install_installing_system_deps": "正在安装系统依赖...",
"install_installing_dep": "正在安装 {name}",
"install_dep_no_install_cmd": "{os} 系统上没有 {name} 的安装命令",
"install_dep_install_failed": "安装 {name} 失败",
"install_deps_skipped": "跳过依赖安装",
"install_deps_failed": "系统依赖安装失败",
# Install - CPU detection
"install_auto_detect_cpu": "正在自动检测 CPU 能力...",
"install_cpu_features": "检测到的 CPU 特性: {features}",
"install_cpu_no_features": "未检测到高级 CPU 特性",
# Install - Build configuration
"install_build_config": "构建配置:",
"install_native_warning": "注意: 二进制文件仅针对当前 CPU 优化(不可移植)",
"install_building_from_source": "正在从源码构建 kt-kernel...",
"install_build_failed": "构建失败",
"install_build_success": "构建成功",
# Install - Verification
"install_verifying": "正在验证安装...",
"install_verify_success": "kt-kernel {version} ({variant} 变体) 安装成功",
"install_verify_failed": "验证失败: {error}",
# Install - Docker
"install_docker_guide_title": "Docker 安装",
"install_docker_guide_desc": "有关 Docker 安装,请参阅官方指南:",
# Config command
"config_show_title": "当前配置",
"config_set_success": "配置已更新: {key} = {value}",
"config_get_value": "{key} = {value}",
"config_get_not_found": "未找到配置项 '{key}'",
"config_reset_confirm": "这将重置所有配置为默认值。是否继续?",
"config_reset_success": "配置已重置为默认值",
"config_file_location": "配置文件: {path}",
# Doctor command
"doctor_title": "KTransformers 环境诊断",
"doctor_checking": "正在运行诊断...",
"doctor_check_python": "Python 版本",
"doctor_check_cuda": "CUDA 可用性",
"doctor_check_gpu": "GPU 检测",
"doctor_check_cpu": "CPU",
"doctor_check_cpu_isa": "CPU 指令集",
"doctor_check_numa": "NUMA 拓扑",
"doctor_check_memory": "系统内存",
"doctor_check_disk": "磁盘空间",
"doctor_check_packages": "必需的包",
"doctor_check_env": "环境变量",
"doctor_status_ok": "正常",
"doctor_status_warning": "警告",
"doctor_status_error": "错误",
"doctor_gpu_found": "发现 {count} 个 GPU: {names}",
"doctor_gpu_not_found": "未检测到 GPU",
"doctor_cpu_info": "{name} ({cores} 核心 / {threads} 线程)",
"doctor_cpu_isa_info": "{isa_list}",
"doctor_cpu_isa_missing": "缺少推荐指令集: {missing}",
"doctor_numa_info": "{nodes} 个节点",
"doctor_numa_detail": "{node}: CPU {cpus}",
"doctor_memory_info": "{available} 可用 / {total} 总计",
"doctor_memory_freq": "{available} 可用 / {total} 总计 ({freq}MHz {type})",
"doctor_disk_info": "{path}{available} 可用空间",
"doctor_all_ok": "所有检查通过!您的环境已就绪。",
"doctor_has_issues": "发现一些问题,请查看上方的警告/错误信息。",
# Run command
"run_detecting_hardware": "检测硬件配置...",
"run_gpu_info": "GPU: {name} ({vram}GB 显存)",
"run_cpu_info": "CPU: {name} ({cores} 核心, {numa} NUMA 节点)",
"run_ram_info": "内存: {total}GB",
"run_checking_model": "检查模型状态...",
"run_model_path": "模型路径: {path}",
"run_weights_not_found": "未找到量化权重",
"run_quant_prompt": "是否现在量化模型?(这可能需要一些时间)",
"run_quantizing": "正在量化模型...",
"run_starting_server": "正在启动服务器...",
"run_server_mode": "模式: SGLang + kt-kernel",
"run_server_port": "端口: {port}",
"run_gpu_experts": "GPU 专家: {count}/层",
"run_cpu_threads": "CPU 线程: {count}",
"run_server_started": "服务器已启动!",
"run_api_url": "API 地址: http://{host}:{port}",
"run_docs_url": "文档地址: http://{host}:{port}/docs",
"run_stop_hint": "按 Ctrl+C 停止服务器",
"run_model_not_found": "未找到模型 '{name}'。请先运行 'kt download'",
"run_multiple_matches": "找到多个匹配的模型,请选择:",
"run_select_model": "选择模型",
"run_select_model_title": "选择要运行的模型",
"run_select_model_prompt": "输入编号",
"run_local_models": "本地模型 (已下载)",
"run_registered_models": "注册模型",
# Download command
"download_list_title": "可用模型",
"download_searching": "正在搜索模型 '{name}'...",
"download_found": "找到: {name}",
"download_multiple_found": "找到多个匹配:",
"download_select": "选择要下载的模型:",
"download_destination": "目标路径: {path}",
"download_starting": "开始下载...",
"download_progress": "正在下载 {name}...",
"download_complete": "下载完成!",
"download_already_exists": "模型已存在于 {path}",
"download_overwrite_prompt": "是否覆盖现有文件?",
# Quant command
"quant_input_path": "输入路径: {path}",
"quant_output_path": "输出路径: {path}",
"quant_method": "量化方法: {method}",
"quant_starting": "开始量化...",
"quant_progress": "正在量化...",
"quant_complete": "量化完成!",
"quant_input_not_found": "未找到输入模型: {path}",
# SFT command
"sft_mode_train": "训练模式",
"sft_mode_chat": "聊天模式",
"sft_mode_export": "导出模式",
"sft_config_path": "配置文件: {path}",
"sft_starting": "正在启动 {mode}...",
"sft_complete": "{mode} 完成!",
"sft_config_not_found": "未找到配置文件: {path}",
# Bench command
"bench_starting": "开始基准测试...",
"bench_type": "测试类型: {type}",
"bench_complete": "基准测试完成!",
"bench_results_title": "基准测试结果",
# Common prompts
"prompt_continue": "是否继续?",
"prompt_select": "请选择:",
"prompt_enter_value": "请输入:",
"prompt_confirm_action": "确认此操作?",
# First-run setup - Model path selection
"setup_model_path_title": "模型存储位置",
"setup_model_path_desc": "大语言模型体积较大50-200GB+)。请选择一个有足够空间的存储位置:",
"setup_scanning_disks": "正在扫描可用存储位置...",
"setup_disk_option": "{path} (可用 {available} / 总共 {total})",
"setup_disk_option_recommended": "{path} (可用 {available} / 总共 {total}) [推荐]",
"setup_custom_path": "输入自定义路径",
"setup_enter_custom_path": "请输入模型存储路径",
"setup_path_not_exist": "路径不存在,是否创建?",
"setup_path_no_write": "没有该路径的写入权限,请选择其他路径。",
"setup_path_low_space": "警告:可用空间不足 100GB可能无法存储大型模型。",
"setup_model_path_set": "模型存储路径已设置为: {path}",
"setup_no_large_disk": "未发现大容量存储位置,使用默认路径。",
"setup_scanning_models": "正在扫描已有模型...",
"setup_found_models": "发现 {count} 个模型:",
"setup_model_info": "{name} ({size}, {type})",
"setup_no_models_found": "该位置未发现已有模型。",
"setup_location_has_models": "发现 {count} 个模型",
"setup_installing_completion": "正在为 {shell} 安装命令补全...",
"setup_completion_installed": "命令补全已安装!重启终端后生效。",
"setup_completion_failed": "命令补全安装失败。请手动运行 'kt --install-completion'",
# Auto completion
"completion_installed_title": "命令补全",
"completion_installed_for": "已为 {shell} 安装命令补全",
"completion_activate_now": "在当前终端会话中启用补全,请运行:",
"completion_next_session": "新的终端会话将自动启用补全。",
# SGLang
"sglang_not_found": "未找到 SGLang",
"sglang_pypi_warning": "PyPI 版本的 SGLang 可能与 kt-kernel 不兼容",
"sglang_pypi_hint": 'PyPI 版本可能不兼容。从源码安装: git clone https://github.com/kvcache-ai/sglang && cd sglang && pip install -e "python[all]"',
"sglang_install_hint": '安装 SGLang: git clone https://github.com/kvcache-ai/sglang && cd sglang && pip install -e "python[all]"',
"sglang_recommend_source": '建议从源码重新安装: git clone https://github.com/kvcache-ai/sglang && cd sglang && pip install -e "python[all]"',
"sglang_kt_kernel_not_supported": "SGLang 不支持 kt-kernel (缺少 --kt-gpu-prefill-token-threshold 参数)",
"sglang_checking_kt_kernel_support": "正在检查 SGLang kt-kernel 支持...",
"sglang_kt_kernel_supported": "SGLang kt-kernel 支持已验证",
# Chat
"chat_proxy_detected": "检测到环境中存在代理设置",
"chat_proxy_confirm": "是否使用代理连接?",
"chat_proxy_disabled": "已在本次会话中禁用代理",
# Model command
"model_supported_title": "KTransformers 支持的模型",
"model_column_model": "模型",
"model_column_status": "状态",
"model_column_local_path": "本地路径",
"model_status_local": "本地",
"model_status_not_downloaded": "未下载",
"model_usage_title": "使用方法",
"model_usage_download": "下载模型:",
"model_usage_list_local": "列出本地模型:",
"model_usage_search": "搜索模型:",
"model_storage_paths_title": "模型存储路径",
"model_local_models_title": "本地已下载的模型",
"model_available_models_title": "可用模型",
"model_no_local_models": "未找到本地已下载的模型",
"model_download_hint": "下载模型:",
"model_download_usage_hint": "用法: kt model download <模型名称>",
"model_download_list_hint": "使用 'kt model download --list' 查看可用模型。",
"model_download_hf_hint": "或直接指定 HuggingFace 仓库: kt model download org/model-name",
"model_saved_to": "模型已保存到: {path}",
"model_start_with": "启动命令: kt run {name}",
"model_download_failed": "下载失败: {error}",
"model_hf_cli_not_found": "未找到 huggingface-cli。请安装: pip install huggingface-hub",
"model_path_not_exist": "路径不存在: {path}",
"model_create_directory": "创建目录 {path}",
"model_created_directory": "已创建目录: {path}",
"model_create_dir_failed": "创建目录失败: {error}",
"model_path_added": "已添加模型路径: {path}",
"model_path_removed": "已移除模型路径: {path}",
"model_path_not_found": "路径未找到或无法移除最后一个路径: {path}",
"model_search_no_results": "未找到匹配 '{query}' 的模型",
"model_search_results_title": "'{query}' 的搜索结果",
"model_column_name": "名称",
"model_column_hf_repo": "HuggingFace 仓库",
"model_column_aliases": "别名",
# Coming soon
"feature_coming_soon": "此功能即将推出...",
},
}
# Cached result of config/locale language detection ("zh"/"en"), to avoid
# repeated settings-file I/O. KT_LANG overrides are intentionally never cached
# (see get_lang); set_lang refreshes this value directly.
_lang_cache: str | None = None
def get_lang() -> str:
    """
    Resolve the active UI language.

    Resolution order:
      1. ``KT_LANG`` environment variable (always wins, never cached)
      2. Previously cached detection result
      3. ``general.language`` from the config file (cached)
      4. System ``LANG`` environment variable (cached)

    Returns:
        Language code: "zh" for Chinese, "en" for everything else.
    """
    global _lang_cache
    # KT_LANG has top priority and deliberately bypasses the cache so a user
    # can override the language per-invocation.
    env_override = os.environ.get("KT_LANG", "").lower()
    if env_override:
        return "zh" if env_override.startswith("zh") else "en"
    # Serve the cached answer to avoid config-file I/O on every call.
    if _lang_cache is not None:
        return _lang_cache
    # Consult the persisted configuration. Imported lazily to avoid a
    # circular import with the settings module.
    from kt_kernel.cli.config.settings import get_settings

    try:
        configured = get_settings().get("general.language", "auto")
    except Exception:
        # Unreadable settings: fall back to system-locale detection below.
        configured = "auto"
    if configured and configured != "auto":
        _lang_cache = "zh" if configured.lower().startswith("zh") else "en"
        return _lang_cache
    # Finally, fall back to the system locale.
    system_locale = os.environ.get("LANG", "").lower()
    _lang_cache = "zh" if system_locale.startswith("zh") else "en"
    return _lang_cache
def t(msg_key: str, **kwargs: Any) -> str:
    """
    Look up *msg_key* in the message catalog for the active language.

    Falls back to the English catalog, and ultimately to the key itself,
    when no translation exists.  Keyword arguments are applied with
    ``str.format``; if the catalog entry references a placeholder that was
    not supplied, the unformatted message is returned unchanged.

    Args:
        msg_key: Message key to translate.
        **kwargs: Format arguments for the message.

    Returns:
        Translated and formatted message string.

    Example:
        >>> t("welcome")
        "Welcome to KTransformers!" # or "欢迎使用 KTransformers" in Chinese
        >>> t("install_found", name="conda", version="24.1.0")
        "Found conda (version 24.1.0)"
    """
    catalog = MESSAGES.get(get_lang(), MESSAGES["en"])
    text = catalog.get(msg_key, MESSAGES["en"].get(msg_key, msg_key))
    if not kwargs:
        return text
    try:
        return text.format(**kwargs)
    except KeyError:
        # Placeholder mismatch between catalogs: surface the raw message
        # rather than crashing the CLI.
        return text
def set_lang(lang: str) -> None:
    """
    Force the session language.

    Updates the module-level cache and sets ``KT_LANG`` — which ``get_lang``
    honours above all other sources — so the change takes effect immediately
    and survives for the rest of the process.

    Args:
        lang: Language code ("en" or "zh")
    """
    global _lang_cache
    _lang_cache = lang  # make the change visible without re-detection
    os.environ["KT_LANG"] = lang

View File

@@ -0,0 +1,436 @@
"""
Main entry point for kt-cli.
KTransformers CLI - A unified command-line interface for KTransformers.
"""
import sys
import typer
from kt_kernel.cli import __version__
from kt_kernel.cli.commands import bench, chat, config, doctor, model, quant, run, sft, version
from kt_kernel.cli.i18n import t, set_lang, get_lang
def _get_app_help() -> str:
    """Return the top-level application help string for the active language."""
    if get_lang() == "zh":
        return "KTransformers CLI - KTransformers 统一命令行界面"
    return "KTransformers CLI - A unified command-line interface for KTransformers."
def _get_help(key: str) -> str:
    """Return the localized help text for command *key*, falling back to English."""
    help_texts = {
        "version": {"en": "Show version information", "zh": "显示版本信息"},
        "run": {"en": "Start model inference server", "zh": "启动模型推理服务器"},
        "chat": {"en": "Interactive chat with running model", "zh": "与运行中的模型进行交互式聊天"},
        "quant": {"en": "Quantize model weights", "zh": "量化模型权重"},
        "bench": {"en": "Run full benchmark", "zh": "运行完整基准测试"},
        "microbench": {"en": "Run micro-benchmark", "zh": "运行微基准测试"},
        "doctor": {"en": "Diagnose environment issues", "zh": "诊断环境问题"},
        "model": {"en": "Manage models and storage paths", "zh": "管理模型和存储路径"},
        "config": {"en": "Manage configuration", "zh": "管理配置"},
        "sft": {"en": "Fine-tuning with LlamaFactory", "zh": "使用 LlamaFactory 进行微调"},
    }
    # Unknown keys degrade gracefully to the key itself.
    entry = help_texts.get(key, {})
    return entry.get(get_lang(), entry.get("en", key))
# Create main app with dynamic help
# NOTE: the help string set here is only the English default; main() calls
# _update_help_texts() at startup to replace it with localized text.
app = typer.Typer(
    name="kt",
    help="KTransformers CLI - A unified command-line interface for KTransformers.",
    no_args_is_help=True,
    add_completion=False,  # Use static completion scripts instead of dynamic completion
    rich_markup_mode="rich",
)
def _update_help_texts() -> None:
    """Re-localize the app help plus every registered command and sub-app."""
    app.info.help = _get_app_help()
    # Plain commands (CommandInfo objects).
    for command in app.registered_commands:
        if getattr(command, "name", None):
            command.help = _get_help(command.name)
    # Sub-applications (group entries).
    for group in app.registered_groups:
        if getattr(group, "name", None):
            group.help = _get_help(group.name)
# Register commands
# Help strings below are the English defaults; _update_help_texts() swaps
# them for localized text at startup.
app.command(name="version", help="Show version information")(version.version)
app.command(name="run", help="Start model inference server")(run.run)
app.command(name="chat", help="Interactive chat with running model")(chat.chat)
app.command(name="quant", help="Quantize model weights")(quant.quant)
app.command(name="bench", help="Run full benchmark")(bench.bench)
app.command(name="microbench", help="Run micro-benchmark")(bench.microbench)
app.command(name="doctor", help="Diagnose environment issues")(doctor.doctor)
# Register sub-apps
app.add_typer(model.app, name="model", help="Manage models and storage paths")
app.add_typer(config.app, name="config", help="Manage configuration")
app.add_typer(sft.app, name="sft", help="Fine-tuning with LlamaFactory")
def check_first_run() -> None:
    """Run the first-time setup wizard when the CLI has not been initialized.

    The wizard is triggered when no config file exists yet, or when one exists
    but the ``general._initialized`` flag is unset.  Non-interactive sessions
    (pipes, CI) are never prompted.

    Fixes vs. previous version: removed an unused ``import os`` and the
    duplicated ``get_settings`` import in both branches.
    """
    # Skip if not running in an interactive terminal.
    if not sys.stdin.isatty():
        return

    # Imported lazily to keep CLI startup cheap and avoid import cycles.
    from kt_kernel.cli.config.settings import DEFAULT_CONFIG_FILE, get_settings

    if not DEFAULT_CONFIG_FILE.exists():
        # First run — show welcome and language selection.
        _show_first_run_setup(get_settings())
    else:
        # Config exists — run the wizard only if it was never completed.
        settings = get_settings()
        if not settings.get("general._initialized"):
            _show_first_run_setup(settings)
def _show_first_run_setup(settings) -> None:
    """Show first-run setup wizard.

    Walks the user through (1) language selection and (2) model-storage-path
    selection, persists both choices via *settings*, and marks the
    configuration as initialized.

    Fixes vs. previous version: removed unused ``Spinner``/``Live`` imports.

    Args:
        settings: The settings object used to persist the chosen values.
    """
    from rich.console import Console
    from rich.panel import Panel
    from rich.prompt import Prompt, Confirm

    from kt_kernel.cli.utils.environment import scan_storage_locations, format_size_gb, scan_models_in_location

    console = Console()

    # Welcome message — bilingual because the language is not chosen yet.
    console.print()
    console.print(
        Panel.fit(
            "[bold cyan]Welcome to KTransformers CLI! / 欢迎使用 KTransformers CLI![/bold cyan]\n\n"
            "Let's set up your preferences.\n"
            "让我们设置您的偏好。",
            title="kt-cli",
            border_style="cyan",
        )
    )
    console.print()

    # --- Language selection -------------------------------------------------
    console.print("[bold]Select your preferred language / 选择您的首选语言:[/bold]")
    console.print()
    console.print(" [cyan][1][/cyan] English")
    console.print(" [cyan][2][/cyan] 中文 (Chinese)")
    console.print()
    while True:
        choice = Prompt.ask("Enter choice / 输入选择", choices=["1", "2"], default="1")
        if choice == "1":
            lang = "en"
            break
        elif choice == "2":
            lang = "zh"
            break

    # Persist and activate immediately so the rest of the wizard is localized.
    settings.set("general.language", lang)
    set_lang(lang)

    console.print()
    if lang == "zh":
        console.print("[green]✓[/green] 语言已设置为中文")
    else:
        console.print("[green]✓[/green] Language set to English")

    # --- Model storage path selection ---------------------------------------
    console.print()
    console.print(f"[bold]{t('setup_model_path_title')}[/bold]")
    console.print()
    console.print(f"[dim]{t('setup_model_path_desc')}[/dim]")
    console.print()

    # Scan for storage locations with at least 50 GB capacity.
    console.print(f"[dim]{t('setup_scanning_disks')}[/dim]")
    locations = scan_storage_locations(min_size_gb=50.0)
    console.print()

    if locations:
        # Look for already-downloaded models in the candidate locations.
        console.print(f"[dim]{t('setup_scanning_models')}[/dim]")
        location_models: dict[str, list] = {}
        for loc in locations[:5]:
            models = scan_models_in_location(loc, max_depth=2)
            if models:
                location_models[loc.path] = models
        console.print()

        # Show options (top 5; the first is marked as recommended).
        for i, loc in enumerate(locations[:5], 1):
            available = format_size_gb(loc.available_gb)
            total = format_size_gb(loc.total_gb)
            if i == 1:
                option_str = t("setup_disk_option_recommended", path=loc.path, available=available, total=total)
            else:
                option_str = t("setup_disk_option", path=loc.path, available=available, total=total)
            # Annotate locations that already contain models.
            if loc.path in location_models:
                model_count = len(location_models[loc.path])
                option_str += f" [green]✓ {t('setup_location_has_models', count=model_count)}[/green]"
            console.print(f" [cyan][{i}][/cyan] {option_str}")
            # Show up to three of the models found in this location.
            if loc.path in location_models:
                for model in location_models[loc.path][:3]:
                    size_str = format_size_gb(model.size_gb)
                    console.print(f" [dim]• {model.name} ({size_str})[/dim]")
                if len(location_models[loc.path]) > 3:
                    remaining = len(location_models[loc.path]) - 3
                    console.print(f" [dim] ... +{remaining} more[/dim]")

        # Custom-path option is always appended after the scanned locations.
        custom_idx = min(len(locations), 5) + 1
        console.print(f" [cyan][{custom_idx}][/cyan] {t('setup_custom_path')}")
        console.print()
        valid_choices = [str(i) for i in range(1, custom_idx + 1)]
        path_choice = Prompt.ask(t("prompt_select"), choices=valid_choices, default="1")
        if path_choice == str(custom_idx):
            selected_path = _prompt_custom_path(console, settings)
        else:
            selected_path = locations[int(path_choice) - 1].path
    else:
        # No large storage found — ask for a custom path.
        console.print(f"[yellow]{t('setup_no_large_disk')}[/yellow]")
        console.print()
        selected_path = _prompt_custom_path(console, settings)

    # Ensure the path exists; fall back to the default on creation failure.
    import os
    from pathlib import Path

    if not os.path.exists(selected_path):
        if Confirm.ask(t("setup_path_not_exist"), default=True):
            try:
                Path(selected_path).mkdir(parents=True, exist_ok=True)
            except (OSError, PermissionError) as e:
                console.print(f"[red]{t('error')}: {e}[/red]")
                # Fall back to default
                selected_path = str(Path.home() / ".ktransformers" / "models")
                Path(selected_path).mkdir(parents=True, exist_ok=True)

    # Warn when the chosen location has little free space left.
    from kt_kernel.cli.utils.environment import detect_disk_space_gb

    available_gb, _ = detect_disk_space_gb(
        selected_path if os.path.exists(selected_path) else str(Path(selected_path).parent)
    )
    if available_gb < 100:
        console.print(f"[yellow]{t('setup_path_low_space')}[/yellow]")

    # Persist the chosen path and mark setup as complete.
    settings.set("paths.models", selected_path)
    settings.set("general._initialized", True)
    console.print()
    console.print(f"[green]✓[/green] {t('setup_model_path_set', path=selected_path)}")
    console.print()

    # Closing tip, in the language chosen above.
    if lang == "zh":
        console.print("[dim]提示: 运行 'kt config show' 查看所有配置[/dim]")
    else:
        console.print("[dim]Tip: Run 'kt config show' to view all settings[/dim]")
    console.print()
def _prompt_custom_path(console, settings) -> str:
    """Prompt user to enter a custom model-storage path.

    Loops until the user supplies a path that either already exists and is
    writable, or whose nearest existing ancestor is writable (so the path can
    be created later).
    """
    import os
    from pathlib import Path
    from rich.prompt import Prompt

    default_path = str(Path.home() / ".ktransformers" / "models")
    while True:
        entered = os.path.expanduser(Prompt.ask(t("setup_enter_custom_path"), default=default_path))
        if os.path.exists(entered):
            # Existing path: accept only when writable.
            if os.access(entered, os.W_OK):
                return entered
        else:
            # Path must be created: walk up to the nearest existing ancestor
            # and require write access there.
            ancestor = str(Path(entered).parent)
            while not os.path.exists(ancestor) and ancestor != "/":
                ancestor = str(Path(ancestor).parent)
            if os.access(ancestor, os.W_OK):
                return entered
        # Either branch failed: report and re-prompt.
        console.print(f"[red]{t('setup_path_no_write')}[/red]")
def _install_shell_completion() -> None:
    """Install shell completion scripts to user directories.

    Uses standard locations that are auto-loaded by shell completion systems:
    - Bash: ~/.local/share/bash-completion/completions/kt (auto-loaded by bash-completion 2.0+)
    - Zsh: ~/.zfunc/_kt (requires fpath setup, but commonly used)
    - Fish: ~/.config/fish/completions/kt.fish (auto-loaded)

    Best-effort: filesystem errors are silently ignored, and nothing is done
    once ``general._completion_installed`` is set in the config.

    Fixes vs. previous version: the ``installed`` flag was computed but never
    consulted, so the config was marked installed even when the completion
    script was missing; now the flag is only persisted after a successful copy
    so a later run can retry.
    """
    import os
    import shutil
    from pathlib import Path

    from kt_kernel.cli.config.settings import get_settings

    settings = get_settings()
    # Skip the work entirely once completion has been installed.
    if settings.get("general._completion_installed", False):
        return

    # Pick the completion flavour from the login shell; bash is the fallback.
    shell = os.environ.get("SHELL", "")
    if "zsh" in shell:
        shell_name = "zsh"
    elif "fish" in shell:
        shell_name = "fish"
    else:
        shell_name = "bash"

    try:
        completions_dir = Path(__file__).parent / "completions"
        home = Path.home()
        if shell_name == "bash":
            # XDG standard location for bash-completion (auto-loaded by 2.0+).
            src_file = completions_dir / "kt-completion.bash"
            dest_dir = home / ".local" / "share" / "bash-completion" / "completions"
            dest_file = dest_dir / "kt"
        elif shell_name == "zsh":
            src_file = completions_dir / "_kt"
            dest_dir = home / ".zfunc"
            dest_file = dest_dir / "_kt"
        else:
            # Fish auto-loads from this directory.
            src_file = completions_dir / "kt.fish"
            dest_dir = home / ".config" / "fish" / "completions"
            dest_file = dest_dir / "kt.fish"

        if src_file.exists():
            dest_dir.mkdir(parents=True, exist_ok=True)
            shutil.copy2(src_file, dest_file)
            # Mark as installed only on success so a missing script does not
            # permanently disable installation.
            settings.set("general._completion_installed", True)
        # For bash/zsh, completion will work in new terminals automatically
        # (bash-completion 2.0+ auto-loads from ~/.local/share/bash-completion/completions/)
    except (OSError, IOError):
        # Silently ignore errors - completion is not critical
        pass
def _apply_saved_language() -> None:
"""Apply the saved language setting.
Priority:
1. KT_LANG environment variable (if already set, don't override)
2. Config file setting
3. System locale (auto)
"""
import os
# Don't override if KT_LANG is already set by user
if os.environ.get("KT_LANG"):
return
from kt_kernel.cli.config.settings import get_settings
settings = get_settings()
lang = settings.get("general.language", "auto")
if lang != "auto":
set_lang(lang)
def main():
    """Main entry point."""
    # Localize help text before Typer renders anything.
    _apply_saved_language()
    _update_help_texts()

    # The first-run wizard (and completion install) is skipped for help,
    # version and config invocations so those stay non-interactive.
    args = sys.argv[1:]
    skip_commands = {"--help", "-h", "config", "version", "--version"}
    should_check_first_run = not any(arg in skip_commands for arg in args)

    if should_check_first_run:
        # Auto-install shell completion on first run (best-effort).
        _install_shell_completion()
        if args:
            # Run the first-run wizard before executing the actual command.
            check_first_run()
    app()
# Allow direct execution (python path/to/main.py or python -m ...).
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,6 @@
# Inference dependencies for KTransformers
# NOTE: sglang is installed separately from source (see install.py)
transformers>=4.45.0
safetensors>=0.4.0
huggingface-hub>=0.20.0

View File

@@ -0,0 +1,7 @@
# SFT (Supervised Fine-Tuning) dependencies for KTransformers
llamafactory>=0.9.0
peft>=0.12.0
transformers>=4.45.0
datasets>=2.14.0
accelerate>=0.30.0

View File

@@ -0,0 +1,3 @@
"""
Utility modules for kt-cli.
"""

View File

@@ -0,0 +1,249 @@
"""
Console utilities for kt-cli.
Provides Rich-based console output helpers for consistent formatting.
"""
from typing import Optional
from rich.console import Console
from rich.panel import Panel
from rich.progress import (
BarColumn,
DownloadColumn,
Progress,
SpinnerColumn,
TaskProgressColumn,
TextColumn,
TimeElapsedColumn,
TimeRemainingColumn,
TransferSpeedColumn,
)
from rich.prompt import Confirm, Prompt
from rich.table import Table
from rich.theme import Theme
from kt_kernel.cli.i18n import t
# Custom theme for kt-cli
# Maps the semantic style names used in markup throughout this module
# ([info], [warning], [error], [success], [highlight], [muted]) to concrete
# Rich styles.
KT_THEME = Theme(
    {
        "info": "cyan",
        "warning": "yellow",
        "error": "bold red",
        "success": "bold green",
        "highlight": "bold magenta",
        "muted": "dim",
    }
)

# Global console instance shared by every helper in this module.
console = Console(theme=KT_THEME)
def print_info(message: str, **kwargs) -> None:
    """Print an info message with a cyan ℹ prefix.

    NOTE(review): the styled span was previously empty ("[info][/info]"),
    leaving info lines without a marker while the sibling helpers print
    ✓/⚠/✗/→; the ℹ glyph was most likely lost in an encoding pass.
    """
    console.print(f"[info]ℹ[/info] {message}", **kwargs)
def print_success(message: str, **kwargs) -> None:
    """Print a success message with a green check-mark prefix."""
    marker = "[success]✓[/success]"
    console.print(f"{marker} {message}", **kwargs)
def print_warning(message: str, **kwargs) -> None:
    """Print a warning message with a yellow ⚠ prefix."""
    marker = "[warning]⚠[/warning]"
    console.print(f"{marker} {message}", **kwargs)
def print_error(message: str, **kwargs) -> None:
    """Print an error message with a red ✗ prefix."""
    marker = "[error]✗[/error]"
    console.print(f"{marker} {message}", **kwargs)
def print_step(message: str, **kwargs) -> None:
    """Print a step indicator with a magenta → prefix."""
    marker = "[highlight]→[/highlight]"
    console.print(f"{marker} {message}", **kwargs)
def print_header(title: str, subtitle: Optional[str] = None) -> None:
    """Print a bold title (plus an optional dimmed subtitle) in a panel."""
    lines = [f"[bold]{title}[/bold]"]
    if subtitle:
        lines.append(f"[muted]{subtitle}[/muted]")
    console.print(Panel("\n".join(lines), expand=False))
def print_version_table(versions: dict[str, Optional[str]]) -> None:
    """Render component -> version pairs as a minimal two-column table.

    Components without a version are shown as "not installed".
    """
    table = Table(show_header=False, box=None, padding=(0, 2))
    table.add_column("Component", style="bold")
    table.add_column("Version")
    for component, ver in versions.items():
        cell = f"[success]{ver}[/success]" if ver else f"[muted]{t('version_not_installed')}[/muted]"
        table.add_row(component, cell)
    console.print(table)
def print_dependency_table(deps: list[dict]) -> None:
    """Render dependency-check results (name / current / required / status).

    Each entry's "status" value ("ok", "outdated", anything else = missing)
    selects a localized, styled status label.
    """
    table = Table(title=t("install_checking_deps"))
    table.add_column(t("version_info"), style="bold")
    table.add_column("Current")
    table.add_column("Required")
    table.add_column("Status")
    for dep in deps:
        status = dep.get("status", "ok")
        if status == "ok":
            label = f"[success]{t('install_dep_ok')}[/success]"
        elif status == "outdated":
            label = f"[warning]{t('install_dep_outdated')}[/warning]"
        else:
            label = f"[error]{t('install_dep_missing')}[/error]"
        table.add_row(dep["name"], dep.get("installed", "-"), dep.get("required", "-"), label)
    console.print(table)
def confirm(message: str, default: bool = True) -> bool:
    """Ask a yes/no question on the shared console and return the answer."""
    answer = Confirm.ask(message, default=default, console=console)
    return answer
def prompt_choice(message: str, choices: list[str], default: Optional[str] = None) -> str:
    """Prompt the user to pick one of *choices*.

    The choices are displayed as a numbered list; the user may answer with
    either the number or the choice text itself, and is re-prompted until a
    valid answer is given.

    Fixes vs. previous version: a *default* that is not present in *choices*
    previously raised ValueError from ``choices.index``; it is now ignored.

    Args:
        message: Question displayed above the numbered list.
        choices: Candidate answers.
        default: Optional default choice; ignored unless it is in *choices*.

    Returns:
        The selected element of *choices*.
    """
    # Display numbered choices
    console.print(f"\n[bold]{message}[/bold]")
    for i, choice in enumerate(choices, 1):
        console.print(f" [highlight][{i}][/highlight] {choice}")
    # Guard against a default that is not actually one of the choices.
    default_number = str(choices.index(default) + 1) if default and default in choices else None
    while True:
        response = Prompt.ask(
            "\n" + t("prompt_select"),
            console=console,
            default=default_number,
        )
        try:
            idx = int(response) - 1
            if 0 <= idx < len(choices):
                return choices[idx]
        except ValueError:
            # Accept the literal choice text as well as its number.
            if response in choices:
                return response
        print_error(f"Please enter a number between 1 and {len(choices)}")
def prompt_text(message: str, default: Optional[str] = None) -> str:
    """Read a line of free-form text from the user on the shared console."""
    response = Prompt.ask(message, console=console, default=default)
    return response
def create_progress() -> Progress:
    """Build the standard spinner + bar progress renderer for generic tasks."""
    columns = (
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        TimeElapsedColumn(),
    )
    return Progress(*columns, console=console)
def create_download_progress() -> Progress:
    """Build a progress renderer for downloads (size, speed, ETA columns)."""
    columns = (
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        DownloadColumn(),
        TransferSpeedColumn(),
        TimeRemainingColumn(),
    )
    return Progress(*columns, console=console)
def print_model_table(models: list[dict]) -> None:
    """Render the model catalog (name / repository / type / hardware needs)."""
    table = Table(title=t("download_list_title"))
    table.add_column("Name", style="bold")
    table.add_column("Repository")
    table.add_column("Type")
    table.add_column("Requirements")
    for model in models:
        # Collect whichever hardware requirements the entry declares.
        requirements = []
        if model.get("gpu_vram_gb"):
            requirements.append(f"GPU: {model['gpu_vram_gb']}GB")
        if model.get("cpu_ram_gb"):
            requirements.append(f"RAM: {model['cpu_ram_gb']}GB")
        table.add_row(
            model.get("name", ""),
            model.get("hf_repo", ""),
            model.get("type", ""),
            ", ".join(requirements) if requirements else "-",
        )
    console.print(table)
def print_hardware_info(gpu_info: str, cpu_info: str, ram_info: str) -> None:
    """Print GPU/CPU/RAM summary lines inside a bordered 'Hardware' panel."""
    table = Table(show_header=False, box=None)
    table.add_column("Icon", width=3)
    table.add_column("Info")
    for icon, info in (("🖥️", gpu_info), ("💻", cpu_info), ("🧠", ram_info)):
        table.add_row(icon, info)
    console.print(Panel(table, title="Hardware", expand=False))
def print_server_info(
    mode: str, host: str, port: int, gpu_experts: int, cpu_threads: int
) -> None:
    """Print a green panel summarizing the freshly started server."""
    table = Table(show_header=False, box=None)
    table.add_column("Key", style="bold")
    table.add_column("Value")
    # Reuse localized "label: value" messages, keeping only the label part.
    rows = (
        (t("run_server_mode").split(":")[0], mode),
        ("Host", host),
        ("Port", str(port)),
        (t("run_gpu_experts").split(":")[0], f"{gpu_experts}/layer"),
        (t("run_cpu_threads").split(":")[0], str(cpu_threads)),
    )
    for key, value in rows:
        table.add_row(key, value)
    console.print(Panel(table, title=t("run_server_started"), expand=False, border_style="green"))
def print_api_info(host: str, port: int) -> None:
    """Print API endpoint information plus a sample curl command.

    Fixes vs. previous version: removed the unused local ``docs_url`` (the
    docs URL line is produced by the localized message instead).
    """
    api_url = f"http://{host}:{port}"
    console.print()
    console.print(f" {t('run_api_url', host=host, port=port)}")
    console.print(f" {t('run_docs_url', host=host, port=port)}")
    console.print()
    console.print(f" [muted]Test command:[/muted]")
    console.print(
        f" [dim]curl {api_url}/v1/chat/completions -H 'Content-Type: application/json' "
        f"-d '{{\"model\": \"default\", \"messages\": [{{\"role\": \"user\", \"content\": \"Hello\"}}]}}'[/dim]"
    )
    console.print()
    console.print(f" [muted]{t('run_stop_hint')}[/muted]")

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,374 @@
"""
Model registry for kt-cli.
Provides a registry of supported models with fuzzy matching capabilities.
"""
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable, Optional
import yaml
from kt_kernel.cli.config.settings import get_settings
@dataclass
class ModelInfo:
    """Information about a supported model."""

    # Canonical display name, e.g. "DeepSeek-V3.2".
    name: str
    # HuggingFace repository id ("org/model").
    hf_repo: str
    # Alternative names accepted by registry lookup (lowercased on registration).
    aliases: list[str] = field(default_factory=list)
    type: str = "moe"  # moe, dense
    # Approximate hardware requirements (0 = unspecified).
    gpu_vram_gb: float = 0
    cpu_ram_gb: float = 0
    # Extra CLI flags applied by default when serving this model.
    default_params: dict = field(default_factory=dict)
    description: str = ""
    # Chinese-language description (shown when the CLI language is "zh").
    description_zh: str = ""
    max_tensor_parallel_size: Optional[int] = None  # Maximum tensor parallel size for this model
# Built-in model registry
# Entries ship with the CLI; user-defined models from registry.yaml are merged
# on top of these (see ModelRegistry._load_user_models).
BUILTIN_MODELS: list[ModelInfo] = [
    # DeepSeek V3 family (AMX INT4 CPU path).
    ModelInfo(
        name="DeepSeek-V3-0324",
        hf_repo="deepseek-ai/DeepSeek-V3-0324",
        aliases=["deepseek-v3-0324", "deepseek-v3", "dsv3", "deepseek3", "v3-0324"],
        type="moe",
        default_params={
            "kt-num-gpu-experts": 1,
            "attention-backend": "triton",
            "disable-shared-experts-fusion": True,
            "kt-method": "AMXINT4",
        },
        description="DeepSeek V3-0324 685B MoE model (March 2025, improved benchmarks)",
        description_zh="DeepSeek V3-0324 685B MoE 模型2025年3月改进的基准测试",
    ),
    # DeepSeek V3.2 uses the FP8 kernel path with flashinfer attention.
    ModelInfo(
        name="DeepSeek-V3.2",
        hf_repo="deepseek-ai/DeepSeek-V3.2",
        aliases=["deepseek-v3.2", "dsv3.2", "deepseek3.2", "v3.2"],
        type="moe",
        default_params={
            "kt-method": "FP8",
            "kt-gpu-prefill-token-threshold": 4096,
            "attention-backend": "flashinfer",
            "fp8-gemm-backend": "triton",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "DeepSeek-V3.2",
            "disable-shared-experts-fusion": True,
        },
        description="DeepSeek V3.2 671B MoE model (latest)",
        description_zh="DeepSeek V3.2 671B MoE 模型(最新)",
    ),
    ModelInfo(
        name="DeepSeek-R1-0528",
        hf_repo="deepseek-ai/DeepSeek-R1-0528",
        aliases=["deepseek-r1-0528", "deepseek-r1", "dsr1", "r1", "r1-0528"],
        type="moe",
        default_params={
            "kt-num-gpu-experts": 1,
            "attention-backend": "triton",
            "disable-shared-experts-fusion": True,
            "kt-method": "AMXINT4",
        },
        description="DeepSeek R1-0528 reasoning model (May 2025, improved reasoning depth)",
        description_zh="DeepSeek R1-0528 推理模型2025年5月改进的推理深度",
    ),
    # Kimi K2 ships RAW INT4 weights, hence the RAWINT4 method.
    ModelInfo(
        name="Kimi-K2-Thinking",
        hf_repo="moonshotai/Kimi-K2-Thinking",
        aliases=["kimi-k2-thinking", "kimi-thinking", "k2-thinking", "kimi", "k2"],
        type="moe",
        default_params={
            "kt-method": "RAWINT4",
            "kt-gpu-prefill-token-threshold": 400,
            "attention-backend": "flashinfer",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "Kimi-K2-Thinking",
            "disable-shared-experts-fusion": True,
        },
        description="Moonshot Kimi K2 Thinking MoE model",
        description_zh="月之暗面 Kimi K2 Thinking MoE 模型",
    ),
    # MiniMax M2/M2.1: FP8 path plus MiniMax-specific tool/reasoning parsers.
    ModelInfo(
        name="MiniMax-M2",
        hf_repo="MiniMaxAI/MiniMax-M2",
        aliases=["minimax-m2", "m2"],
        type="moe",
        default_params={
            "kt-method": "FP8",
            "kt-gpu-prefill-token-threshold": 4096,
            "attention-backend": "flashinfer",
            "fp8-gemm-backend": "triton",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "MiniMax-M2",
            "disable-shared-experts-fusion": True,
            "tool-call-parser": "minimax-m2",
            "reasoning-parser": "minimax-append-think",
        },
        description="MiniMax M2 MoE model",
        description_zh="MiniMax M2 MoE 模型",
        max_tensor_parallel_size=4,  # M2 only supports up to 4-way tensor parallelism
    ),
    ModelInfo(
        name="MiniMax-M2.1",
        hf_repo="MiniMaxAI/MiniMax-M2.1",
        aliases=["minimax-m2.1", "m2.1"],
        type="moe",
        default_params={
            "kt-method": "FP8",
            "kt-gpu-prefill-token-threshold": 4096,
            "attention-backend": "flashinfer",
            "fp8-gemm-backend": "triton",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "MiniMax-M2.1",
            "disable-shared-experts-fusion": True,
            "tool-call-parser": "minimax-m2",
            "reasoning-parser": "minimax-append-think",
        },
        description="MiniMax M2.1 MoE model (enhanced multi-language programming)",
        description_zh="MiniMax M2.1 MoE 模型(增强多语言编程能力)",
        max_tensor_parallel_size=4,  # M2.1 only supports up to 4-way tensor parallelism
    ),
]
class ModelRegistry:
    """Registry of supported models with fuzzy matching."""

    def __init__(self):
        """Populate the registry with built-in models, then user-defined ones."""
        self._models: dict[str, ModelInfo] = {}
        self._aliases: dict[str, str] = {}
        self._load_builtin_models()
        self._load_user_models()

    def _load_builtin_models(self) -> None:
        """Register every model shipped with the package."""
        for builtin in BUILTIN_MODELS:
            self._register(builtin)

    def _load_user_models(self) -> None:
        """Register user-defined models from the config dir's registry.yaml.

        Parse or I/O errors are swallowed so a broken user file never
        prevents the built-in models from being available.
        """
        registry_file = get_settings().config_dir / "registry.yaml"
        if not registry_file.exists():
            return
        try:
            with open(registry_file, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
            for name, info in data.get("models", {}).items():
                self._register(
                    ModelInfo(
                        name=name,
                        hf_repo=info.get("hf_repo", ""),
                        aliases=info.get("aliases", []),
                        type=info.get("type", "moe"),
                        gpu_vram_gb=info.get("gpu_vram_gb", 0),
                        cpu_ram_gb=info.get("cpu_ram_gb", 0),
                        default_params=info.get("default_params", {}),
                        description=info.get("description", ""),
                        description_zh=info.get("description_zh", ""),
                        max_tensor_parallel_size=info.get("max_tensor_parallel_size"),
                    )
                )
        except (yaml.YAMLError, OSError):
            # Malformed or unreadable user registry: keep builtins only.
            pass

    def _register(self, model: ModelInfo) -> None:
        """Index a model under its lowercased canonical name and all aliases."""
        canonical = model.name.lower()
        self._models[canonical] = model
        for alias in model.aliases:
            self._aliases[alias.lower()] = canonical

    def get(self, name: str) -> Optional[ModelInfo]:
        """Get a model by exact name or alias (case-insensitive)."""
        key = name.lower()
        # Canonical names take precedence over aliases.
        direct = self._models.get(key)
        if direct is not None:
            return direct
        if key in self._aliases:
            return self._models[self._aliases[key]]
        return None

    def search(self, query: str, limit: int = 10) -> list[ModelInfo]:
        """Search for models using fuzzy matching.

        Args:
            query: Search query.
            limit: Maximum number of results.

        Returns:
            List of matching models, sorted by relevance (best first).
        """
        q = query.lower()
        scored = [
            (score, model)
            for model in self._models.values()
            if (score := self._match_score(q, model)) > 0
        ]
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [model for _, model in scored[:limit]]

    def _match_score(self, query: str, model: ModelInfo) -> float:
        """Score how well *query* matches *model*, from 0 (no match) to 1 (exact)."""
        name_lower = model.name.lower()
        aliases_lower = [alias.lower() for alias in model.aliases]
        if query == name_lower:
            return 1.0
        if any(query == alias for alias in aliases_lower):
            return 0.95
        if query in name_lower:
            return 0.8
        if any(query in alias for alias in aliases_lower):
            return 0.7
        if query in model.hf_repo.lower():
            return 0.6
        # Last resort: score by how many separator-delimited query tokens
        # appear in the name (empty tokens still count in the denominator).
        raw_parts = re.split(r"[-_.\s]", query)
        hit_count = sum(1 for part in raw_parts if part and part in name_lower)
        if hit_count > 0:
            return 0.5 * (hit_count / len(raw_parts))
        return 0.0

    def list_all(self) -> list[ModelInfo]:
        """List all registered models."""
        return [*self._models.values()]

    def find_local_models(self) -> list[tuple[ModelInfo, Path]]:
        """Find models that are downloaded locally in any configured model path.

        Returns:
            List of (ModelInfo, path) tuples for local models.
        """
        search_dirs = get_settings().get_model_paths()
        located: list[tuple[ModelInfo, Path]] = []
        for model in self._models.values():
            path = self._locate_model(model, search_dirs)
            if path is not None:
                located.append((model, path))
        return located

    def _locate_model(self, model: ModelInfo, search_dirs) -> Optional[Path]:
        """Return the first on-disk directory for *model*, or None if absent."""
        repo_tail = model.hf_repo.split("/")[-1]
        for base in search_dirs:
            if not base.exists():
                continue
            # Common directory layouts for downloaded checkpoints.
            for candidate in (
                base / model.name,
                base / model.name.lower(),
                base / repo_tail,
                base / model.hf_repo.replace("/", "--"),
            ):
                # A config.json marks a complete HF-style checkpoint dir.
                if candidate.exists() and (candidate / "config.json").exists():
                    return candidate
        return None
# Global registry instance (lazily created by get_registry()).
_registry: Optional[ModelRegistry] = None


def get_registry() -> ModelRegistry:
    """Return the process-wide ModelRegistry, building it on first call."""
    global _registry
    if _registry is not None:
        return _registry
    _registry = ModelRegistry()
    return _registry
# ============================================================================
# Model-specific parameter computation functions
# ============================================================================
def compute_deepseek_v3_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
    """Compute kt-num-gpu-experts for DeepSeek V3/R1 family.

    Reserves 16 GB per GPU for non-expert usage; the remaining VRAM across
    the tensor-parallel group is divided by 3 (GB budgeted per expert —
    TODO confirm against the kernel's actual expert footprint).

    Args:
        tensor_parallel_size: Number of GPUs in the tensor-parallel group.
        vram_per_gpu_gb: Usable VRAM per GPU, in GB.

    Returns:
        Number of experts to place on GPU; 0 if per-GPU VRAM is below the reserve.
    """
    per_gpu_reserve_gb = 16
    if vram_per_gpu_gb < per_gpu_reserve_gb:
        return 0
    total_vram = int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_reserve_gb))
    return total_vram // 3
def compute_kimi_k2_thinking_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
    """Compute kt-num-gpu-experts for Kimi K2 Thinking.

    Reserves 16 GB per GPU for non-expert usage, then budgets 2 experts per
    3 GB of remaining group-wide VRAM (denser than DeepSeek's 1 per 3 GB —
    presumably because K2 experts are smaller; TODO confirm).

    Args:
        tensor_parallel_size: Number of GPUs in the tensor-parallel group.
        vram_per_gpu_gb: Usable VRAM per GPU, in GB.

    Returns:
        Number of experts to place on GPU; 0 if per-GPU VRAM is below the reserve.
    """
    per_gpu_reserve_gb = 16
    if vram_per_gpu_gb < per_gpu_reserve_gb:
        return 0
    total_vram = int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_reserve_gb))
    return total_vram * 2 // 3
def compute_minimax_m2_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
    """Compute kt-num-gpu-experts for MiniMax M2/M2.1.

    Reserves 16 GB per GPU for non-expert usage; one expert per GB of
    remaining group-wide VRAM (the original `total_vram // 1` was a no-op
    and has been removed).

    Args:
        tensor_parallel_size: Number of GPUs in the tensor-parallel group.
        vram_per_gpu_gb: Usable VRAM per GPU, in GB.

    Returns:
        Number of experts to place on GPU; 0 if per-GPU VRAM is below the reserve.
    """
    per_gpu_reserve_gb = 16
    if vram_per_gpu_gb < per_gpu_reserve_gb:
        return 0
    return int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_reserve_gb))
# Model name to computation function mapping.
# Keys are canonical ModelInfo names; each value derives `kt-num-gpu-experts`
# from (tensor_parallel_size, vram_per_gpu_gb).
MODEL_COMPUTE_FUNCTIONS: dict[str, Callable[[int, float], int]] = {
    "DeepSeek-V3-0324": compute_deepseek_v3_gpu_experts,
    "DeepSeek-V3.2": compute_deepseek_v3_gpu_experts,  # Same as V3-0324
    "DeepSeek-R1-0528": compute_deepseek_v3_gpu_experts,  # Same as V3-0324
    "Kimi-K2-Thinking": compute_kimi_k2_thinking_gpu_experts,
    "MiniMax-M2": compute_minimax_m2_gpu_experts,
    "MiniMax-M2.1": compute_minimax_m2_gpu_experts,  # Same as M2
}

View File

@@ -0,0 +1,407 @@
"""
SGLang installation checker and installation instructions provider.
This module provides utilities to:
- Check if SGLang is installed and get its metadata
- Provide installation instructions when SGLang is not found
"""
import subprocess
import sys
from pathlib import Path
from typing import Optional
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import console
def check_sglang_installation() -> dict:
    """Check if SGLang is installed and get its metadata.
    Returns:
        dict with keys:
        - installed: bool
        - version: str or None
        - location: str or None (installation path)
        - editable: bool (whether installed in editable mode)
        - git_info: dict or None (git remote and branch if available)
        - from_source: bool (whether installed from source repository)
    """
    try:
        # Try to import sglang
        import sglang
        version = getattr(sglang, "__version__", None)
        # Use pip show to get detailed package information
        location = None
        editable = False
        git_info = None
        from_source = False
        try:
            # Get pip show output
            result = subprocess.run(
                [sys.executable, "-m", "pip", "show", "sglang"],
                capture_output=True,
                text=True,
                timeout=10,
            )
            if result.returncode == 0:
                # Parse the "Key: Value" lines emitted by `pip show`.
                pip_info = {}
                for line in result.stdout.split("\n"):
                    if ":" in line:
                        key, value = line.split(":", 1)
                        pip_info[key.strip()] = value.strip()
                location = pip_info.get("Location")
                # `pip show` reports this field only for editable (-e) installs.
                editable_location = pip_info.get("Editable project location")
                if editable_location:
                    editable = True
                    location = editable_location
        except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
            # Fallback to module location
            if hasattr(sglang, "__file__") and sglang.__file__:
                location = str(Path(sglang.__file__).parent.parent)
        # Check if it's installed from source (has .git directory)
        if location:
            git_root = None
            check_path = Path(location)
            # Check current directory and up to 2 parent directories
            for _ in range(3):
                git_dir = check_path / ".git"
                if git_dir.exists():
                    git_root = check_path
                    from_source = True
                    break
                if check_path.parent == check_path:  # Reached root
                    break
                check_path = check_path.parent
            if from_source and git_root:
                # Try to get git remote and branch info
                try:
                    # Get remote URL
                    result = subprocess.run(
                        ["git", "remote", "get-url", "origin"],
                        cwd=git_root,
                        capture_output=True,
                        text=True,
                        timeout=5,
                    )
                    remote_url = result.stdout.strip() if result.returncode == 0 else None
                    # Extract org/repo from URL
                    remote_short = None
                    if remote_url:
                        # Handle both https and git@ URLs
                        if "github.com" in remote_url:
                            parts = remote_url.rstrip("/").replace(".git", "").split("github.com")[-1]
                            remote_short = parts.lstrip("/").lstrip(":")
                    # Get current branch
                    result = subprocess.run(
                        ["git", "branch", "--show-current"],
                        cwd=git_root,
                        capture_output=True,
                        text=True,
                        timeout=5,
                    )
                    branch = result.stdout.strip() if result.returncode == 0 else None
                    if remote_url or branch:
                        git_info = {
                            "remote": remote_short or remote_url,
                            "branch": branch,
                        }
                except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
                    # Git metadata is best-effort; its absence is not an error.
                    pass
        return {
            "installed": True,
            "version": version,
            "location": location,
            "editable": editable,
            "git_info": git_info,
            "from_source": from_source,
        }
    except ImportError:
        # sglang is not importable at all.
        return {
            "installed": False,
            "version": None,
            "location": None,
            "editable": False,
            "git_info": None,
            "from_source": False,
        }
def get_sglang_install_instructions(lang: Optional[str] = None) -> str:
    """Get SGLang installation instructions.
    Args:
        lang: Language code ('en' or 'zh'). If None, uses current language setting.
    Returns:
        Formatted installation instructions string.
    """
    # Imported lazily to avoid a circular import with the i18n module.
    from kt_kernel.cli.i18n import get_lang
    if lang is None:
        lang = get_lang()
    if lang == "zh":
        # Chinese instructions; CJK text is written as \uXXXX escapes in the literal.
        return """
[bold yellow]SGLang \u672a\u5b89\u88c5[/bold yellow]
\u8bf7\u6309\u7167\u4ee5\u4e0b\u6b65\u9aa4\u5b89\u88c5 SGLang:
[bold]1. \u514b\u9686\u4ed3\u5e93:[/bold]
git clone https://github.com/kvcache-ai/sglang.git
cd sglang
[bold]2. \u5b89\u88c5 (\u4e8c\u9009\u4e00):[/bold]
[cyan]\u65b9\u5f0f A - pip \u5b89\u88c5 (\u63a8\u8350):[/cyan]
pip install -e "python[all]"
[cyan]\u65b9\u5f0f B - uv \u5b89\u88c5 (\u66f4\u5feb):[/cyan]
pip install uv
uv pip install -e "python[all]"
[dim]\u6ce8\u610f: \u8bf7\u786e\u4fdd\u5728\u6b63\u786e\u7684 Python \u73af\u5883\u4e2d\u6267\u884c\u4ee5\u4e0a\u547d\u4ee4[/dim]
"""
    else:
        # English instructions (default for any non-"zh" language code).
        return """
[bold yellow]SGLang is not installed[/bold yellow]
Please follow these steps to install SGLang:
[bold]1. Clone the repository:[/bold]
git clone https://github.com/kvcache-ai/sglang.git
cd sglang
[bold]2. Install (choose one):[/bold]
[cyan]Option A - pip install (recommended):[/cyan]
pip install -e "python[all]"
[cyan]Option B - uv install (faster):[/cyan]
pip install uv
uv pip install -e "python[all]"
[dim]Note: Make sure to run these commands in the correct Python environment[/dim]
"""
def print_sglang_install_instructions() -> None:
    """Render the SGLang installation guide on the shared console."""
    console.print(get_sglang_install_instructions())
def check_sglang_and_warn() -> bool:
    """Check if SGLang is installed, print warning if not.
    Returns:
        True if SGLang is installed, False otherwise.
    """
    info = check_sglang_installation()
    if not info["installed"]:
        print_sglang_install_instructions()
        return False
    # Installed, but not from a source checkout: warn that PyPI builds
    # are not recommended (after the early return, "installed" is known True).
    if not info["from_source"]:
        from kt_kernel.cli.utils.console import print_warning
        print_warning(t("sglang_pypi_warning"))
        console.print()
        console.print("[dim]" + t("sglang_recommend_source") + "[/dim]")
        console.print()
    return True
def _get_sglang_kt_kernel_cache_path() -> Path:
"""Get the path to the sglang kt-kernel support cache file."""
cache_dir = Path.home() / ".ktransformers" / "cache"
cache_dir.mkdir(parents=True, exist_ok=True)
return cache_dir / "sglang_kt_kernel_supported"
def _is_sglang_kt_kernel_cache_valid() -> bool:
    """Check if the sglang kt-kernel support cache is valid.
    The cache is considered valid if:
    1. The cache file exists
    2. The cache file contains 'true' (indicating previous check passed)
    Returns:
        True if cache is valid and indicates support, False otherwise.
    """
    marker = _get_sglang_kt_kernel_cache_path()
    if not marker.exists():
        return False
    try:
        return marker.read_text().strip().lower() == "true"
    except (OSError, IOError):
        # Unreadable cache counts as "no cached result".
        return False
def _save_sglang_kt_kernel_cache(supported: bool) -> None:
    """Persist the sglang kt-kernel support check result ("true"/"false")."""
    marker = _get_sglang_kt_kernel_cache_path()
    payload = "true" if supported else "false"
    try:
        marker.write_text(payload)
    except (OSError, IOError):
        # Best-effort cache: write failures are non-fatal.
        pass
def clear_sglang_kt_kernel_cache() -> None:
    """Clear the sglang kt-kernel support cache, forcing a re-check on next run.

    Removal is best-effort: filesystem errors are ignored, matching the
    write-side behavior of _save_sglang_kt_kernel_cache.
    """
    cache_path = _get_sglang_kt_kernel_cache_path()
    try:
        # missing_ok avoids the exists()/unlink() race of the previous version.
        # IOError is an alias of OSError, so one except clause suffices.
        cache_path.unlink(missing_ok=True)
    except OSError:
        pass
def check_sglang_kt_kernel_support(use_cache: bool = True, silent: bool = False) -> dict:
    """Check if SGLang supports kt-kernel parameters (--kt-gpu-prefill-token-threshold).
    This function runs `python -m sglang.launch_server --help` and checks if the
    output contains the `--kt-gpu-prefill-token-threshold` parameter. This parameter
    is only available in the kvcache-ai/sglang fork, not in the official sglang.
    The result is cached after the first successful check to avoid repeated checks.
    Args:
        use_cache: If True, use cached result if available. Default is True.
        silent: If True, don't print checking message. Default is False.
    Returns:
        dict with keys:
        - supported: bool - True if kt-kernel parameters are supported
        - help_output: str or None - The help output from sglang.launch_server
        - error: str or None - Error message if check failed
        - from_cache: bool - True if result was from cache
    """
    from kt_kernel.cli.utils.console import print_step
    # Check cache first
    # (only positive results are cached, so a valid cache always means "supported").
    if use_cache and _is_sglang_kt_kernel_cache_valid():
        return {
            "supported": True,
            "help_output": None,
            "error": None,
            "from_cache": True,
        }
    # Print checking message
    if not silent:
        print_step(t("sglang_checking_kt_kernel_support"))
    try:
        result = subprocess.run(
            [sys.executable, "-m", "sglang.launch_server", "--help"],
            capture_output=True,
            text=True,
            timeout=30,
        )
        # Help text may land on either stream; scan both.
        help_output = result.stdout + result.stderr
        # Check if --kt-gpu-prefill-token-threshold is in the help output
        supported = "--kt-gpu-prefill-token-threshold" in help_output
        # Save to cache if supported
        # (negative results are deliberately not cached so a reinstall is picked up).
        if supported:
            _save_sglang_kt_kernel_cache(True)
        return {
            "supported": supported,
            "help_output": help_output,
            "error": None,
            "from_cache": False,
        }
    except subprocess.TimeoutExpired:
        return {
            "supported": False,
            "help_output": None,
            "error": "Timeout while checking sglang.launch_server --help",
            "from_cache": False,
        }
    except FileNotFoundError:
        return {
            "supported": False,
            "help_output": None,
            "error": "Python interpreter not found",
            "from_cache": False,
        }
    except Exception as e:
        # Catch-all: a support probe must never crash the CLI.
        return {
            "supported": False,
            "help_output": None,
            "error": str(e),
            "from_cache": False,
        }
def print_sglang_kt_kernel_instructions() -> None:
    """Print instructions for installing the kvcache-ai fork of SGLang with kt-kernel support."""
    # Imported lazily to avoid a circular import with the i18n module.
    from kt_kernel.cli.i18n import get_lang
    lang = get_lang()
    if lang == "zh":
        # User-facing Chinese text (runtime string; must not be translated).
        instructions = """
[bold red]SGLang 不支持 kt-kernel[/bold red]
您当前安装的 SGLang 不包含 kt-kernel 支持。
kt-kernel 需要使用 kvcache-ai 维护的 SGLang 分支。
[bold]请按以下步骤重新安装 SGLang:[/bold]
[cyan]1. 卸载当前的 SGLang:[/cyan]
pip uninstall sglang -y
[cyan]2. 克隆 kvcache-ai 的 SGLang 仓库:[/cyan]
git clone https://github.com/kvcache-ai/sglang.git
cd sglang
[cyan]3. 安装 SGLang:[/cyan]
pip install -e "python[all]"
[dim]注意: 请确保在正确的 Python 环境中执行以上命令[/dim]
"""
    else:
        # English instructions (default for any non-"zh" language code).
        instructions = """
[bold red]SGLang does not support kt-kernel[/bold red]
Your current SGLang installation does not include kt-kernel support.
kt-kernel requires the kvcache-ai maintained fork of SGLang.
[bold]Please reinstall SGLang with the following steps:[/bold]
[cyan]1. Uninstall current SGLang:[/cyan]
pip uninstall sglang -y
[cyan]2. Clone the kvcache-ai SGLang repository:[/cyan]
git clone https://github.com/kvcache-ai/sglang.git
cd sglang
[cyan]3. Install SGLang:[/cyan]
pip install -e "python[all]"
[dim]Note: Make sure to run these commands in the correct Python environment[/dim]
"""
    console.print(instructions)

View File

@@ -17,7 +17,7 @@ from typing import List, Optional
from .experts_base import BaseMoEWrapper, KExpertsCPUBuffer
# Import backend implementations
from .utils.amx import AMXMoEWrapper, RAWAMXMoEWrapper
from .utils.amx import AMXMoEWrapper, NativeMoEWrapper
from .utils.llamafile import LlamafileMoEWrapper
from .utils.moe_kernel import GeneralMoEWrapper
@@ -77,7 +77,7 @@ class KTMoEWrapper:
chunked_prefill_size: Maximum prefill chunk size
cpu_save: Whether to save weights to CPU memory
max_deferred_experts_per_token: Number of experts per token to defer. Defaults to 0.
method: Backend method ("AMXINT4", "AMXINT8", "RAWINT4", "LLAMAFILE", "MOE_INT4", "MOE_INT8")
method: Backend method ("AMXINT4", "AMXINT8", "RAWINT4", "FP8", "LLAMAFILE", "MOE_INT4", "MOE_INT8")
Returns:
An instance of the appropriate backend implementation (e.g., AMXMoEWrapper)
@@ -85,8 +85,8 @@ class KTMoEWrapper:
# Select backend based on method
if method in ["AMXINT4", "AMXINT8"]:
backend_cls = AMXMoEWrapper
elif method == "RAWINT4":
backend_cls = RAWAMXMoEWrapper
elif method in ["RAWINT4", "FP8"]:
backend_cls = NativeMoEWrapper
elif method == "LLAMAFILE":
backend_cls = LlamafileMoEWrapper
elif method in ["MOE_INT4", "MOE_INT8"]:

View File

@@ -4,13 +4,13 @@
Utilities for kt_kernel package.
"""
from .amx import AMXMoEWrapper, RAWAMXMoEWrapper
from .amx import AMXMoEWrapper, NativeMoEWrapper
from .llamafile import LlamafileMoEWrapper
from .loader import SafeTensorLoader, GGUFLoader, CompressedSafeTensorLoader
__all__ = [
"AMXMoEWrapper",
"RAWAMXMoEWrapper",
"NativeMoEWrapper",
"LlamafileMoEWrapper",
"SafeTensorLoader",
"CompressedSafeTensorLoader",

View File

@@ -4,16 +4,16 @@ import ctypes
# Use relative imports for package structure
from ..experts_base import BaseMoEWrapper
from .loader import SafeTensorLoader, CompressedSafeTensorLoader
from .loader import SafeTensorLoader, CompressedSafeTensorLoader, FP8SafeTensorLoader
from kt_kernel_ext.moe import MOEConfig
try:
from kt_kernel_ext.moe import AMXInt4_MOE, AMXInt8_MOE, AMXInt4_KGroup_MOE
from kt_kernel_ext.moe import AMXInt4_MOE, AMXInt8_MOE, AMXInt4_KGroup_MOE, AMXFP8_MOE
_HAS_AMX_SUPPORT = True
except (ImportError, AttributeError):
_HAS_AMX_SUPPORT = False
AMXInt4_MOE, AMXInt8_MOE, AMXInt4_KGroup_MOE = None, None, None
AMXInt4_MOE, AMXInt8_MOE, AMXInt4_KGroup_MOE, AMXFP8_MOE = None, None, None, None
from typing import Optional
@@ -303,10 +303,10 @@ class AMXMoEWrapper(BaseMoEWrapper):
del self.down_scales
class RAWAMXMoEWrapper(BaseMoEWrapper):
"""Wrapper for RAWINT4 experts stored in compressed SafeTensor format."""
class NativeMoEWrapper(BaseMoEWrapper):
"""Wrapper for RAWINT4/FP8 experts stored in compressed SafeTensor format."""
_compressed_loader_instance = None
_native_loader_instance = None
def __init__(
self,
@@ -324,8 +324,12 @@ class RAWAMXMoEWrapper(BaseMoEWrapper):
max_deferred_experts_per_token: Optional[int] = None,
method: str = "RAWINT4",
):
if not _HAS_AMX_SUPPORT or AMXInt4_KGroup_MOE is None:
if not _HAS_AMX_SUPPORT:
raise RuntimeError("AMX backend is not available.")
if method == "RAWINT4" and AMXInt4_KGroup_MOE is None:
raise RuntimeError("AMX backend with RAWINT4 support is not available.")
if method == "FP8" and AMXFP8_MOE is None:
raise RuntimeError("AMX backend with FP8 support is not available.")
super().__init__(
layer_idx=layer_idx,
@@ -343,9 +347,14 @@ class RAWAMXMoEWrapper(BaseMoEWrapper):
method=method,
)
if RAWAMXMoEWrapper._compressed_loader_instance is None:
RAWAMXMoEWrapper._compressed_loader_instance = CompressedSafeTensorLoader(weight_path)
self.loader = RAWAMXMoEWrapper._compressed_loader_instance
if NativeMoEWrapper._native_loader_instance is None:
if method == "RAWINT4":
NativeMoEWrapper._native_loader_instance = CompressedSafeTensorLoader(weight_path)
elif method == "FP8":
NativeMoEWrapper._native_loader_instance = FP8SafeTensorLoader(weight_path)
else:
raise NotImplementedError(f"Unsupported method for NativeMoEWrapper: {method}")
self.loader = NativeMoEWrapper._native_loader_instance
self.gate_weights = None
self.up_weights = None
@@ -378,9 +387,17 @@ class RAWAMXMoEWrapper(BaseMoEWrapper):
self.down_weights = weights["down"]
# Convert scales to bf16 individually
self.gate_scales = [t.to(torch.bfloat16).contiguous() for t in weights["gate_scale"]]
self.up_scales = [t.to(torch.bfloat16).contiguous() for t in weights["up_scale"]]
self.down_scales = [t.to(torch.bfloat16).contiguous() for t in weights["down_scale"]]
# self.gate_scales = [t.to(torch.bfloat16).contiguous() for t in weights["gate_scale"]]
# self.up_scales = [t.to(torch.bfloat16).contiguous() for t in weights["up_scale"]]
# self.down_scales = [t.to(torch.bfloat16).contiguous() for t in weights["down_scale"]]
self.gate_scales = weights["gate_scale"]
self.up_scales = weights["up_scale"]
self.down_scales = weights["down_scale"]
if self.method == "RAWINT4":
assert self.gate_scales[0].dtype == torch.bfloat16, "Expected bf16 scales for RAWINT4"
elif self.method == "FP8":
assert self.gate_scales[0].dtype == torch.float32, "Expected float32 scales for FP8"
t2 = time.time()
# Build pointer lists: [numa_id][expert_id] -> pointer
@@ -404,18 +421,6 @@ class RAWAMXMoEWrapper(BaseMoEWrapper):
moe_config.pool = self.cpu_infer.backend_
moe_config.max_len = self.chunked_prefill_size
# Infer group_size from scale shape (column-major layout)
# For gate/up projection: in_features = hidden_size
# So: group_size = hidden_size / scale.shape[1]
scale_shape = self.gate_scales[0].shape
group_size = self.hidden_size // scale_shape[1]
print(f"[RAWAMXMoEWrapper Layer {self.layer_idx}] Inferred group_size: {group_size}")
moe_config.quant_config.bits = 4
moe_config.quant_config.group_size = group_size
moe_config.quant_config.zero_point = False
# Use gate_projs instead of gate_proj for per-expert pointers
moe_config.gate_projs = gate_ptrs
moe_config.up_projs = up_ptrs
@@ -424,7 +429,21 @@ class RAWAMXMoEWrapper(BaseMoEWrapper):
moe_config.up_scales = up_scale_ptrs
moe_config.down_scales = down_scale_ptrs
self.moe = AMXInt4_KGroup_MOE(moe_config)
# Infer group_size from scale shape (column-major layout)
# For gate/up projection: in_features = hidden_size
# So: group_size = hidden_size / scale.shape[1]
if self.method == "RAWINT4":
group_size = self.hidden_size // self.gate_scales[0].shape[1]
moe_config.quant_config.bits = 4
moe_config.quant_config.group_size = group_size
moe_config.quant_config.zero_point = False
self.moe = AMXInt4_KGroup_MOE(moe_config)
elif self.method == "FP8":
moe_config.quant_config.bits = 8
moe_config.quant_config.group_size = 128
moe_config.quant_config.zero_point = False
self.moe = AMXFP8_MOE(moe_config)
t4 = time.time()
self.cpu_infer.submit(self.moe.load_weights_task(physical_to_logical_map_cpu.data_ptr()))
@@ -440,7 +459,7 @@ class RAWAMXMoEWrapper(BaseMoEWrapper):
t6 = time.time()
print(
f"[RAWAMXMoEWrapper Layer {self.layer_idx}] "
f"[NativeMoEWrapper Layer {self.layer_idx}] "
f"load_experts: {(t1-t0)*1000:.1f}ms, "
f"prepare_tensors: {(t2-t1)*1000:.1f}ms, "
f"build_ptrs: {(t3-t2)*1000:.1f}ms, "
@@ -453,7 +472,7 @@ class RAWAMXMoEWrapper(BaseMoEWrapper):
def submit_write_weight_scale_to_buffer(
self,
gpu_tp_count: int,
gpu_experts_num: int,
expert_id: int,
w13_weight_ptrs,
w13_scale_ptrs,
w2_weight_ptrs,
@@ -477,7 +496,7 @@ class RAWAMXMoEWrapper(BaseMoEWrapper):
self.cpu_infer.submit(
self.moe.write_weight_scale_to_buffer_task(
gpu_tp_count,
gpu_experts_num,
expert_id,
w13_weight_ptrs,
w13_scale_ptrs,
w2_weight_ptrs,

View File

@@ -219,4 +219,4 @@ class LlamafileMoEWrapper(BaseMoEWrapper):
self.cpu_infer.sync()
# Drop original weights after loading
self.weights_to_keep = None
self.weights_to_keep = None

View File

@@ -237,6 +237,117 @@ class SafeTensorLoader:
return name in self.tensor_file_map
class FP8SafeTensorLoader(SafeTensorLoader):
    """Loader for FP8 expert weights with auto-detection of naming formats.
    Supported formats:
    - DeepSeek style: {base}.mlp.experts.{id}.{gate,up,down}_proj.weight
    - Mixtral/MiniMax style: {base}.block_sparse_moe.experts.{id}.{w1,w3,w2}.weight
    The format is auto-detected during initialization.
    """
    # Known MoE naming formats: (experts_path_template, gate_name, up_name, down_name)
    MOE_FORMATS = {
        "deepseek": ("{base}.mlp.experts", "gate_proj", "up_proj", "down_proj"),
        "mixtral": ("{base}.block_sparse_moe.experts", "w1", "w3", "w2"),
    }
    def __init__(self, file_path: str):
        super().__init__(file_path)
        # Key into MOE_FORMATS once detection has run ("deepseek" or "mixtral").
        self._detected_format = None
        self._detect_format()
    def _detect_format(self) -> None:
        """Auto-detect the MoE naming format by checking tensor keys."""
        # Sample some tensor names to detect format
        # (only the first 1000 keys, to keep startup cheap on large checkpoints).
        sample_keys = list(self.tensor_file_map.keys())[:1000]
        for fmt_name, (path_tpl, gate, up, down) in self.MOE_FORMATS.items():
            # Check if any key matches this format pattern
            # Look for pattern like: model.layers.0.{experts_path}.0.{gate_name}.weight
            for key in sample_keys:
                if ".experts." in key and f".{gate}.weight" in key:
                    # Verify the path template matches
                    if "block_sparse_moe.experts" in key and fmt_name == "mixtral":
                        self._detected_format = fmt_name
                        print(f"[FP8SafeTensorLoader] Detected format: {fmt_name}")
                        return
                    elif "mlp.experts" in key and "block_sparse_moe" not in key and fmt_name == "deepseek":
                        self._detected_format = fmt_name
                        print(f"[FP8SafeTensorLoader] Detected format: {fmt_name}")
                        return
        # Default to deepseek if no format detected
        self._detected_format = "deepseek"
        print("[FP8SafeTensorLoader] No MoE format detected, defaulting to: deepseek")
    def _get_experts_prefix(self, base_key: str) -> str:
        """Get the experts prefix based on detected format."""
        path_tpl, _, _, _ = self.MOE_FORMATS[self._detected_format]
        return path_tpl.format(base=base_key)
    def _get_proj_names(self):
        """Get projection names (gate, up, down) based on detected format."""
        _, gate, up, down = self.MOE_FORMATS[self._detected_format]
        return gate, up, down
    def load_tensor(self, key: str, device: str = "cpu"):
        """Load a single tensor by fully-qualified key, moving it to *device* if not "cpu".
        Raises:
            KeyError: If the key is not present in any mapped file.
            FileNotFoundError: If the owning file has no open handle.
        """
        if key not in self.tensor_file_map:
            raise KeyError(f"Key {key} not found in Safetensor files")
        file = self.tensor_file_map[key]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        tensor = f.get_tensor(key)
        if device == "cpu":
            # Return as loaded; no .to() call needed for the default device.
            return tensor
        return tensor.to(device)
    def load_experts(self, base_key: str, device: str = "cpu"):
        """Load FP8 expert weights and their block-wise scale_inv tensors."""
        experts_prefix = self._get_experts_prefix(base_key)
        gate_name, up_name, down_name = self._get_proj_names()
        # Count experts by probing consecutive ids until a gate weight is missing.
        expert_count = 0
        while self.has_tensor(f"{experts_prefix}.{expert_count}.{gate_name}.weight"):
            expert_count += 1
        if expert_count == 0:
            raise ValueError(f"No experts found for key {experts_prefix}")
        gate_weights = [None] * expert_count
        up_weights = [None] * expert_count
        down_weights = [None] * expert_count
        gate_scales = [None] * expert_count
        up_scales = [None] * expert_count
        down_scales = [None] * expert_count
        for exp_id in range(expert_count):
            gate_w_key = f"{experts_prefix}.{exp_id}.{gate_name}.weight"
            up_w_key = f"{experts_prefix}.{exp_id}.{up_name}.weight"
            down_w_key = f"{experts_prefix}.{exp_id}.{down_name}.weight"
            # FP8 checkpoints store dequantization scales under `weight_scale_inv`.
            gate_s_key = f"{experts_prefix}.{exp_id}.{gate_name}.weight_scale_inv"
            up_s_key = f"{experts_prefix}.{exp_id}.{up_name}.weight_scale_inv"
            down_s_key = f"{experts_prefix}.{exp_id}.{down_name}.weight_scale_inv"
            gate_weights[exp_id] = self.load_tensor(gate_w_key, device).contiguous()
            up_weights[exp_id] = self.load_tensor(up_w_key, device).contiguous()
            down_weights[exp_id] = self.load_tensor(down_w_key, device).contiguous()
            gate_scales[exp_id] = self.load_tensor(gate_s_key, device).contiguous()
            up_scales[exp_id] = self.load_tensor(up_s_key, device).contiguous()
            down_scales[exp_id] = self.load_tensor(down_s_key, device).contiguous()
        return {
            "gate": gate_weights,
            "up": up_weights,
            "down": down_weights,
            "gate_scale": gate_scales,
            "up_scale": up_scales,
            "down_scale": down_scales,
        }
class CompressedSafeTensorLoader(SafeTensorLoader):
"""Loader for compressed SafeTensor layouts (RAWINT4 weights)."""