* [feat]: add custom option for kt run

* [feat]: depth 3
This commit is contained in:
ErvinXie
2025-12-29 15:18:42 +08:00
committed by GitHub
parent 4b235cdaa4
commit 9539ab91eb
4 changed files with 382 additions and 151 deletions

View File

@@ -4,6 +4,8 @@ Doctor command for kt-cli.
Diagnoses environment issues and provides recommendations. Diagnoses environment issues and provides recommendations.
""" """
import glob
import os
import platform import platform
import shutil import shutil
from pathlib import Path from pathlib import Path
@@ -29,6 +31,67 @@ from kt_kernel.cli.utils.environment import (
) )
def _get_kt_kernel_info() -> dict:
    """Collect kt-kernel installation details for the doctor report.

    Returns:
        dict with keys:
            installed (bool): whether ``import kt_kernel`` succeeds.
            version (str | None): ``kt_kernel.__version__`` if available.
            cpu_variant (str | None): ``kt_kernel.__cpu_variant__`` if available.
            install_path (str | None): directory containing the package.
            available_variants (list[str]): normalized variant names parsed
                from the compiled extension files found on disk.
            extension_file (str | None): ``__file__`` of the loaded extension
                module, if one is attached to the package.
            error (str): present only when an unexpected exception occurred.
    """
    info = {
        "installed": False,
        "version": None,
        "cpu_variant": None,
        "install_path": None,
        "available_variants": [],
        "extension_file": None,
    }
    try:
        import kt_kernel

        info["installed"] = True
        info["version"] = getattr(kt_kernel, "__version__", "unknown")
        info["cpu_variant"] = getattr(kt_kernel, "__cpu_variant__", "unknown")
        # Get installation path
        info["install_path"] = os.path.dirname(kt_kernel.__file__)

        # Find compiled extension files shipped alongside the package.
        kt_kernel_dir = info["install_path"]
        so_files = glob.glob(os.path.join(kt_kernel_dir, "_kt_kernel_ext_*.so"))
        so_files.extend(glob.glob(os.path.join(kt_kernel_dir, "kt_kernel_ext*.so")))

        # Parse variant names from filenames.
        variants = set()
        for so_file in so_files:
            variant = _parse_variant_from_so_name(os.path.basename(so_file))
            if variant:
                variants.add(variant)
        info["available_variants"] = sorted(variants)

        # Report which extension module is currently loaded, if any.
        if hasattr(kt_kernel, "kt_kernel_ext"):
            ext_module = kt_kernel.kt_kernel_ext
            info["extension_file"] = getattr(ext_module, "__file__", None)
    except ImportError:
        info["installed"] = False
    except Exception as e:
        # Best-effort diagnostics: report the problem instead of crashing doctor.
        info["error"] = str(e)
    return info


def _parse_variant_from_so_name(basename: str):
    """Extract a normalized CPU-variant name from an extension filename.

    Examples:
        "_kt_kernel_ext_amx.cpython-311-x86_64-linux-gnu.so" -> "amx"
        "_kt_kernel_ext_avx512_bf16.cpython-311-x86_64-linux-gnu.so" -> "avx512"
        "kt_kernel_ext.cpython-311-x86_64-linux-gnu.so" -> "default"

    Returns:
        The variant string, or None when the name is unrecognized.

    Note:
        The previous ``basename.split("_")[3]`` parsing always returned "ext"
        (the leading "_" produces an empty first element) and also broke
        multi-token variants such as "avx512_bf16"; slicing between the
        "_kt_kernel_ext_" prefix and the first "." (the ABI tag) is robust.
    """
    if "_kt_kernel_ext_" in basename:
        variant = basename.split("_kt_kernel_ext_", 1)[1].split(".", 1)[0]
        if not variant:
            return None
        # Collapse all avx512 sub-variants (bf16/vbmi/vnni/base/...) into one bucket.
        if variant.startswith("avx512"):
            return "avx512"
        return variant
    if "kt_kernel_ext" in basename:
        return "default"
    return None
def doctor( def doctor(
verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed diagnostics"), verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed diagnostics"),
) -> None: ) -> None:
@@ -157,6 +220,76 @@ def doctor(
} }
) )
# 6b. kt-kernel installation check
kt_info = _get_kt_kernel_info()
if kt_info["installed"]:
# Build display string for kt-kernel
variant = kt_info["cpu_variant"]
version = kt_info["version"]
available_variants = kt_info["available_variants"]
# Determine status based on CPU variant
if variant == "amx":
kt_status = "ok"
kt_hint = "AMX variant loaded - optimal performance"
elif variant.startswith("avx512"):
kt_status = "ok"
kt_hint = "AVX512 variant loaded - good performance"
elif variant == "avx2":
kt_status = "warning"
kt_hint = "AVX2 variant - consider upgrading CPU for AMX/AVX512"
else:
kt_status = "warning"
kt_hint = f"Unknown variant: {variant}"
kt_value = f"v{version} ({variant.upper()})"
if verbose and available_variants:
kt_value += f" [dim] - available: {', '.join(available_variants)}[/dim]"
checks.append(
{
"name": "kt-kernel",
"status": kt_status,
"value": kt_value,
"hint": kt_hint,
}
)
# Show extension file path in verbose mode
if verbose and kt_info.get("extension_file"):
ext_file = os.path.basename(kt_info["extension_file"])
checks.append(
{
"name": " └─ Extension",
"status": "ok",
"value": ext_file,
"hint": None,
}
)
# Show installation path in verbose mode
if verbose and kt_info.get("install_path"):
checks.append(
{
"name": " └─ Path",
"status": "ok",
"value": kt_info["install_path"],
"hint": None,
}
)
else:
error_msg = kt_info.get("error", "Not installed")
checks.append(
{
"name": "kt-kernel",
"status": "error",
"value": error_msg,
"hint": "kt-kernel is required - run: pip install kt-kernel",
}
)
issues_found = True
# 7. System memory (with frequency if available) # 7. System memory (with frequency if available)
mem_info = detect_memory_info() mem_info = detect_memory_info()
if mem_info.frequency_mhz and mem_info.type: if mem_info.frequency_mhz and mem_info.type:
@@ -204,7 +337,6 @@ def doctor(
# 6. Required packages # 6. Required packages
packages = [ packages = [
("kt-kernel", ">=0.4.0", False), # name, version_req, required ("kt-kernel", ">=0.4.0", False), # name, version_req, required
("ktransformers", ">=0.4.0", False),
("sglang", ">=0.4.0", False), ("sglang", ">=0.4.0", False),
("torch", ">=2.4.0", True), ("torch", ">=2.4.0", True),
("transformers", ">=4.45.0", True), ("transformers", ">=4.45.0", True),

View File

@@ -10,6 +10,7 @@ import sys
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import click
import typer import typer
from kt_kernel.cli.config.settings import get_settings from kt_kernel.cli.config.settings import get_settings
@@ -30,128 +31,163 @@ from kt_kernel.cli.utils.environment import detect_cpu_info, detect_gpus, detect
from kt_kernel.cli.utils.model_registry import MODEL_COMPUTE_FUNCTIONS, ModelInfo, get_registry from kt_kernel.cli.utils.model_registry import MODEL_COMPUTE_FUNCTIONS, ModelInfo, get_registry
@click.command(
context_settings={"ignore_unknown_options": True, "allow_extra_args": True},
add_help_option=False, # We'll handle help manually to avoid conflicts
)
@click.argument("model", required=False, default=None)
@click.option("--host", "-H", default=None, help="Server host address")
@click.option("--port", "-p", type=int, default=None, help="Server port")
@click.option("--gpu-experts", type=int, default=None, help="Number of GPU experts per layer")
@click.option("--cpu-threads", type=int, default=None, help="Number of CPU inference threads")
@click.option("--numa-nodes", type=int, default=None, help="Number of NUMA nodes")
@click.option(
"--tensor-parallel-size", "--tp", "tensor_parallel_size", type=int, default=None, help="Tensor parallel size"
)
@click.option("--model-path", type=click.Path(), default=None, help="Custom model path")
@click.option("--weights-path", type=click.Path(), default=None, help="Custom quantized weights path")
@click.option("--kt-method", default=None, help="KT quantization method")
@click.option(
"--kt-gpu-prefill-threshold", "kt_gpu_prefill_threshold", type=int, default=None, help="GPU prefill token threshold"
)
@click.option("--attention-backend", default=None, help="Attention backend")
@click.option("--max-total-tokens", "max_total_tokens", type=int, default=None, help="Maximum total tokens")
@click.option("--max-running-requests", "max_running_requests", type=int, default=None, help="Maximum running requests")
@click.option("--chunked-prefill-size", "chunked_prefill_size", type=int, default=None, help="Chunked prefill size")
@click.option("--mem-fraction-static", "mem_fraction_static", type=float, default=None, help="Memory fraction static")
@click.option("--watchdog-timeout", "watchdog_timeout", type=int, default=None, help="Watchdog timeout")
@click.option("--served-model-name", "served_model_name", default=None, help="Served model name")
@click.option(
"--disable-shared-experts-fusion",
"disable_shared_experts_fusion",
is_flag=True,
default=None,
help="Disable shared experts fusion",
)
@click.option(
"--enable-shared-experts-fusion",
"enable_shared_experts_fusion",
is_flag=True,
default=False,
help="Enable shared experts fusion",
)
@click.option("--quantize", "-q", is_flag=True, default=False, help="Quantize model")
@click.option("--advanced", is_flag=True, default=False, help="Show advanced options")
@click.option("--dry-run", "dry_run", is_flag=True, default=False, help="Show command without executing")
@click.pass_context
def run( def run(
model: Optional[str] = typer.Argument( ctx: click.Context,
None, model: Optional[str],
help="Model name or path (e.g., deepseek-v3, qwen3-30b). If not specified, shows interactive selection.", host: Optional[str],
), port: Optional[int],
host: str = typer.Option( gpu_experts: Optional[int],
None, cpu_threads: Optional[int],
"--host", numa_nodes: Optional[int],
"-H", tensor_parallel_size: Optional[int],
help="Server host address", model_path: Optional[str],
), weights_path: Optional[str],
port: int = typer.Option( kt_method: Optional[str],
None, kt_gpu_prefill_threshold: Optional[int],
"--port", attention_backend: Optional[str],
"-p", max_total_tokens: Optional[int],
help="Server port", max_running_requests: Optional[int],
), chunked_prefill_size: Optional[int],
# CPU/GPU configuration mem_fraction_static: Optional[float],
gpu_experts: Optional[int] = typer.Option( watchdog_timeout: Optional[int],
None, served_model_name: Optional[str],
"--gpu-experts", disable_shared_experts_fusion: Optional[bool],
help="Number of GPU experts per layer", enable_shared_experts_fusion: bool,
), quantize: bool,
cpu_threads: Optional[int] = typer.Option( advanced: bool,
None, dry_run: bool,
"--cpu-threads",
help="Number of CPU inference threads (kt-cpuinfer, defaults to 80% of CPU cores)",
),
numa_nodes: Optional[int] = typer.Option(
None,
"--numa-nodes",
help="Number of NUMA nodes",
),
tensor_parallel_size: Optional[int] = typer.Option(
None,
"--tensor-parallel-size",
"--tp",
help="Tensor parallel size (number of GPUs)",
),
# Model paths
model_path: Optional[Path] = typer.Option(
None,
"--model-path",
help="Custom model path",
),
weights_path: Optional[Path] = typer.Option(
None,
"--weights-path",
help="Custom quantized weights path",
),
# KT-kernel options
kt_method: Optional[str] = typer.Option(
None,
"--kt-method",
help="KT quantization method (AMXINT4, RAWFP8, etc.)",
),
kt_gpu_prefill_token_threshold: Optional[int] = typer.Option(
None,
"--kt-gpu-prefill-threshold",
help="GPU prefill token threshold for kt-kernel",
),
# SGLang options
attention_backend: Optional[str] = typer.Option(
None,
"--attention-backend",
help="Attention backend (triton, flashinfer)",
),
max_total_tokens: Optional[int] = typer.Option(
None,
"--max-total-tokens",
help="Maximum total tokens",
),
max_running_requests: Optional[int] = typer.Option(
None,
"--max-running-requests",
help="Maximum running requests",
),
chunked_prefill_size: Optional[int] = typer.Option(
None,
"--chunked-prefill-size",
help="Chunked prefill size",
),
mem_fraction_static: Optional[float] = typer.Option(
None,
"--mem-fraction-static",
help="Memory fraction for static allocation",
),
watchdog_timeout: Optional[int] = typer.Option(
None,
"--watchdog-timeout",
help="Watchdog timeout in seconds",
),
served_model_name: Optional[str] = typer.Option(
None,
"--served-model-name",
help="Custom model name for API responses",
),
# Performance flags
disable_shared_experts_fusion: Optional[bool] = typer.Option(
None,
"--disable-shared-experts-fusion/--enable-shared-experts-fusion",
help="Disable/enable shared experts fusion",
),
# Other options
quantize: bool = typer.Option(
False,
"--quantize",
"-q",
help="Quantize model if weights not found",
),
advanced: bool = typer.Option(
False,
"--advanced",
help="Show advanced options",
),
dry_run: bool = typer.Option(
False,
"--dry-run",
help="Show command without executing",
),
) -> None: ) -> None:
"""Start model inference server.""" """Start model inference server.
\b
Examples: kt run deepseek-v3 | kt run m2 --tensor-parallel-size 2 | kt run /path/to/model --gpu-experts 4
\b
Custom Options: Pass any SGLang server option directly (e.g., kt run m2 --fp8-gemm-backend triton).
Common: --fp8-gemm-backend, --tool-call-parser, --reasoning-parser, --dp-size, --enable-ma
For full list: python -m sglang.launch_server --help
"""
# Handle --help manually since we disabled it
# Check sys.argv for --help or -h since ctx.args may not be set yet
if "--help" in sys.argv or "-h" in sys.argv:
click.echo(ctx.get_help())
return
# Handle disable/enable shared experts fusion flags
if enable_shared_experts_fusion:
disable_shared_experts_fusion = False
elif disable_shared_experts_fusion is None:
disable_shared_experts_fusion = None
# Convert Path objects from click
model_path_obj = Path(model_path) if model_path else None
weights_path_obj = Path(weights_path) if weights_path else None
# Get extra args that weren't parsed (unknown options)
# click stores these in ctx.args when ignore_unknown_options=True
extra_cli_args = list(ctx.args) if ctx.args else []
# Remove --help from extra args if present (already handled)
extra_cli_args = [arg for arg in extra_cli_args if arg not in ["--help", "-h"]]
# Call the actual run function implementation
_run_impl(
model=model,
host=host,
port=port,
gpu_experts=gpu_experts,
cpu_threads=cpu_threads,
numa_nodes=numa_nodes,
tensor_parallel_size=tensor_parallel_size,
model_path=model_path_obj,
weights_path=weights_path_obj,
kt_method=kt_method,
kt_gpu_prefill_threshold=kt_gpu_prefill_threshold,
attention_backend=attention_backend,
max_total_tokens=max_total_tokens,
max_running_requests=max_running_requests,
chunked_prefill_size=chunked_prefill_size,
mem_fraction_static=mem_fraction_static,
watchdog_timeout=watchdog_timeout,
served_model_name=served_model_name,
disable_shared_experts_fusion=disable_shared_experts_fusion,
quantize=quantize,
advanced=advanced,
dry_run=dry_run,
extra_cli_args=extra_cli_args,
)
def _run_impl(
model: Optional[str],
host: Optional[str],
port: Optional[int],
gpu_experts: Optional[int],
cpu_threads: Optional[int],
numa_nodes: Optional[int],
tensor_parallel_size: Optional[int],
model_path: Optional[Path],
weights_path: Optional[Path],
kt_method: Optional[str],
kt_gpu_prefill_threshold: Optional[int],
attention_backend: Optional[str],
max_total_tokens: Optional[int],
max_running_requests: Optional[int],
chunked_prefill_size: Optional[int],
mem_fraction_static: Optional[float],
watchdog_timeout: Optional[int],
served_model_name: Optional[str],
disable_shared_experts_fusion: Optional[bool],
quantize: bool,
advanced: bool,
dry_run: bool,
extra_cli_args: list[str],
) -> None:
"""Actual implementation of run command."""
# Check if SGLang is installed before proceeding # Check if SGLang is installed before proceeding
from kt_kernel.cli.utils.sglang_checker import ( from kt_kernel.cli.utils.sglang_checker import (
check_sglang_installation, check_sglang_installation,
@@ -387,7 +423,7 @@ def run(
# KT-kernel options # KT-kernel options
final_kt_method = kt_method or model_defaults.get("kt-method") or settings.get("inference.kt_method", "AMXINT4") final_kt_method = kt_method or model_defaults.get("kt-method") or settings.get("inference.kt_method", "AMXINT4")
final_kt_gpu_prefill_threshold = ( final_kt_gpu_prefill_threshold = (
kt_gpu_prefill_token_threshold kt_gpu_prefill_threshold
or model_defaults.get("kt-gpu-prefill-token-threshold") or model_defaults.get("kt-gpu-prefill-token-threshold")
or settings.get("inference.kt_gpu_prefill_token_threshold", 4096) or settings.get("inference.kt_gpu_prefill_token_threshold", 4096)
) )
@@ -456,6 +492,7 @@ def run(
disable_shared_experts_fusion=final_disable_shared_experts_fusion, disable_shared_experts_fusion=final_disable_shared_experts_fusion,
settings=settings, settings=settings,
extra_model_params=extra_params, extra_model_params=extra_params,
extra_cli_args=extra_cli_args,
) )
# Prepare environment variables # Prepare environment variables
@@ -535,29 +572,51 @@ def run(
raise typer.Exit(1) raise typer.Exit(1)
def _find_model_path(model_info: ModelInfo, settings) -> Optional[Path]: def _find_model_path(model_info: ModelInfo, settings, max_depth: int = 3) -> Optional[Path]:
"""Find the model path on disk by searching all configured model paths.""" """Find the model path on disk by searching all configured model paths.
Args:
model_info: Model information to search for
settings: Settings instance
max_depth: Maximum depth to search within each model path (default: 3)
Returns:
Path to the model directory, or None if not found
"""
model_paths = settings.get_model_paths() model_paths = settings.get_model_paths()
# Generate possible names to search for
possible_names = [
model_info.name,
model_info.name.lower(),
model_info.name.replace(" ", "-"),
model_info.hf_repo.split("/")[-1],
model_info.hf_repo.replace("/", "--"),
]
# Add alias-based names
for alias in model_info.aliases:
possible_names.append(alias)
possible_names.append(alias.lower())
# Search in all configured model directories # Search in all configured model directories
for models_dir in model_paths: for models_dir in model_paths:
# Check common path patterns if not models_dir.exists():
possible_paths = [ continue
models_dir / model_info.name,
models_dir / model_info.name.lower(),
models_dir / model_info.name.replace(" ", "-"),
models_dir / model_info.hf_repo.split("/")[-1],
models_dir / model_info.hf_repo.replace("/", "--"),
]
# Add alias-based paths # Search recursively up to max_depth
for alias in model_info.aliases: for depth in range(max_depth):
possible_paths.append(models_dir / alias) for name in possible_names:
possible_paths.append(models_dir / alias.lower()) if depth == 0:
# Direct children: models_dir / name
search_paths = [models_dir / name]
else:
# Nested: use rglob to find directories matching the name
search_paths = list(models_dir.rglob(name))
for path in possible_paths: for path in search_paths:
if path.exists() and (path / "config.json").exists(): if path.exists() and (path / "config.json").exists():
return path return path
return None return None
@@ -613,6 +672,7 @@ def _build_sglang_command(
disable_shared_experts_fusion: bool, disable_shared_experts_fusion: bool,
settings, settings,
extra_model_params: Optional[dict] = None, # New parameter for additional params extra_model_params: Optional[dict] = None, # New parameter for additional params
extra_cli_args: Optional[list[str]] = None, # Extra args from CLI to pass to sglang
) -> list[str]: ) -> list[str]:
"""Build the SGLang launch command.""" """Build the SGLang launch command."""
cmd = [ cmd = [
@@ -734,6 +794,10 @@ def _build_sglang_command(
if extra_args: if extra_args:
cmd.extend(extra_args) cmd.extend(extra_args)
# Add extra CLI args (user-provided options not defined in kt CLI)
if extra_cli_args:
cmd.extend(extra_cli_args)
return cmd return cmd

View File

@@ -68,7 +68,8 @@ def _update_help_texts() -> None:
# Register commands # Register commands
app.command(name="version", help="Show version information")(version.version) app.command(name="version", help="Show version information")(version.version)
app.command(name="run", help="Start model inference server")(run.run) # Run command is handled specially in main() to allow extra args
# (not registered here to avoid typer's argument parsing)
app.command(name="chat", help="Interactive chat with running model")(chat.chat) app.command(name="chat", help="Interactive chat with running model")(chat.chat)
app.command(name="quant", help="Quantize model weights")(quant.quant) app.command(name="quant", help="Quantize model weights")(quant.quant)
app.command(name="bench", help="Run full benchmark")(bench.bench) app.command(name="bench", help="Run full benchmark")(bench.bench)
@@ -429,6 +430,15 @@ def main():
if should_check_first_run and args: if should_check_first_run and args:
check_first_run() check_first_run()
# Handle "run" command specially to pass through unknown options
if args and args[0] == "run":
# Get args after "run"
run_args = args[1:]
# Use click command directly with ignore_unknown_options
from kt_kernel.cli.commands import run as run_module
sys.exit(run_module.run.main(args=run_args, standalone_mode=False))
app() app()

View File

@@ -280,9 +280,12 @@ class ModelRegistry:
"""List all registered models.""" """List all registered models."""
return list(self._models.values()) return list(self._models.values())
def find_local_models(self) -> list[tuple[ModelInfo, Path]]: def find_local_models(self, max_depth: int = 3) -> list[tuple[ModelInfo, Path]]:
"""Find models that are downloaded locally in any configured model path. """Find models that are downloaded locally in any configured model path.
Args:
max_depth: Maximum depth to search within each model path (default: 3)
Returns: Returns:
List of (ModelInfo, path) tuples for local models List of (ModelInfo, path) tuples for local models
""" """
@@ -297,18 +300,40 @@ class ModelRegistry:
if not models_dir.exists(): if not models_dir.exists():
continue continue
# Check common path patterns # Generate possible names to search for
possible_paths = [ possible_names = [
models_dir / model.name, model.name,
models_dir / model.name.lower(), model.name.lower(),
models_dir / model.hf_repo.split("/")[-1], model.hf_repo.split("/")[-1],
models_dir / model.hf_repo.replace("/", "--"), model.hf_repo.replace("/", "--"),
] ]
for path in possible_paths: # Search recursively up to max_depth
if path.exists() and (path / "config.json").exists(): for depth in range(max_depth):
results.append((model, path)) # Build glob pattern for current depth
found = True # depth=0: direct children, depth=1: grandchildren, etc.
glob_pattern = "*" if depth > 0 else ""
for _ in range(depth):
glob_pattern = "*/" + glob_pattern if glob_pattern else "*"
for name in possible_names:
if depth == 0:
# Direct children: models_dir / name
search_paths = [models_dir / name]
else:
# Nested: use rglob to find directories matching the name
search_paths = list(models_dir.rglob(name))
for path in search_paths:
if path.exists() and (path / "config.json").exists():
results.append((model, path))
found = True
break
if found:
break
if found:
break break
if found: if found: