* [feat]: add custom option for kt run

* [feat]: depth 3
This commit is contained in:
ErvinXie
2025-12-29 15:18:42 +08:00
committed by GitHub
parent 4b235cdaa4
commit 9539ab91eb
4 changed files with 382 additions and 151 deletions

View File

@@ -4,6 +4,8 @@ Doctor command for kt-cli.
Diagnoses environment issues and provides recommendations. Diagnoses environment issues and provides recommendations.
""" """
import glob
import os
import platform import platform
import shutil import shutil
from pathlib import Path from pathlib import Path
@@ -29,6 +31,67 @@ from kt_kernel.cli.utils.environment import (
) )
def _get_kt_kernel_info() -> dict:
    """Collect kt-kernel installation details for the doctor report.

    Returns:
        dict with keys:
            installed (bool): whether ``import kt_kernel`` succeeds.
            version (str | None): ``kt_kernel.__version__`` if available.
            cpu_variant (str | None): ``kt_kernel.__cpu_variant__`` if available.
            install_path (str | None): directory containing the package.
            available_variants (list[str]): normalized variant names parsed
                from the compiled extension files found on disk.
            extension_file (str | None): ``__file__`` of the loaded extension
                module, if one is attached to the package.
            error (str): present only when an unexpected exception occurred.
    """
    info = {
        "installed": False,
        "version": None,
        "cpu_variant": None,
        "install_path": None,
        "available_variants": [],
        "extension_file": None,
    }
    try:
        import kt_kernel

        info["installed"] = True
        info["version"] = getattr(kt_kernel, "__version__", "unknown")
        info["cpu_variant"] = getattr(kt_kernel, "__cpu_variant__", "unknown")
        # Get installation path
        info["install_path"] = os.path.dirname(kt_kernel.__file__)

        # Find compiled extension files shipped alongside the package.
        kt_kernel_dir = info["install_path"]
        so_files = glob.glob(os.path.join(kt_kernel_dir, "_kt_kernel_ext_*.so"))
        so_files.extend(glob.glob(os.path.join(kt_kernel_dir, "kt_kernel_ext*.so")))

        # Parse variant names from filenames.
        variants = set()
        for so_file in so_files:
            variant = _parse_variant_from_so_name(os.path.basename(so_file))
            if variant:
                variants.add(variant)
        info["available_variants"] = sorted(variants)

        # Report which extension module is currently loaded, if any.
        if hasattr(kt_kernel, "kt_kernel_ext"):
            ext_module = kt_kernel.kt_kernel_ext
            info["extension_file"] = getattr(ext_module, "__file__", None)
    except ImportError:
        info["installed"] = False
    except Exception as e:
        # Best-effort diagnostics: report the problem instead of crashing doctor.
        info["error"] = str(e)
    return info


def _parse_variant_from_so_name(basename: str):
    """Extract a normalized CPU-variant name from an extension filename.

    Examples:
        "_kt_kernel_ext_amx.cpython-311-x86_64-linux-gnu.so" -> "amx"
        "_kt_kernel_ext_avx512_bf16.cpython-311-x86_64-linux-gnu.so" -> "avx512"
        "kt_kernel_ext.cpython-311-x86_64-linux-gnu.so" -> "default"

    Returns:
        The variant string, or None when the name is unrecognized.

    Note:
        The previous ``basename.split("_")[3]`` parsing always returned "ext"
        (the leading "_" produces an empty first element) and also broke
        multi-token variants such as "avx512_bf16"; slicing between the
        "_kt_kernel_ext_" prefix and the first "." (the ABI tag) is robust.
    """
    if "_kt_kernel_ext_" in basename:
        variant = basename.split("_kt_kernel_ext_", 1)[1].split(".", 1)[0]
        if not variant:
            return None
        # Collapse all avx512 sub-variants (bf16/vbmi/vnni/base/...) into one bucket.
        if variant.startswith("avx512"):
            return "avx512"
        return variant
    if "kt_kernel_ext" in basename:
        return "default"
    return None
def doctor( def doctor(
verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed diagnostics"), verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed diagnostics"),
) -> None: ) -> None:
@@ -157,6 +220,76 @@ def doctor(
} }
) )
# 6b. kt-kernel installation check
kt_info = _get_kt_kernel_info()
if kt_info["installed"]:
# Build display string for kt-kernel
variant = kt_info["cpu_variant"]
version = kt_info["version"]
available_variants = kt_info["available_variants"]
# Determine status based on CPU variant
if variant == "amx":
kt_status = "ok"
kt_hint = "AMX variant loaded - optimal performance"
elif variant.startswith("avx512"):
kt_status = "ok"
kt_hint = "AVX512 variant loaded - good performance"
elif variant == "avx2":
kt_status = "warning"
kt_hint = "AVX2 variant - consider upgrading CPU for AMX/AVX512"
else:
kt_status = "warning"
kt_hint = f"Unknown variant: {variant}"
kt_value = f"v{version} ({variant.upper()})"
if verbose and available_variants:
kt_value += f" [dim] - available: {', '.join(available_variants)}[/dim]"
checks.append(
{
"name": "kt-kernel",
"status": kt_status,
"value": kt_value,
"hint": kt_hint,
}
)
# Show extension file path in verbose mode
if verbose and kt_info.get("extension_file"):
ext_file = os.path.basename(kt_info["extension_file"])
checks.append(
{
"name": " └─ Extension",
"status": "ok",
"value": ext_file,
"hint": None,
}
)
# Show installation path in verbose mode
if verbose and kt_info.get("install_path"):
checks.append(
{
"name": " └─ Path",
"status": "ok",
"value": kt_info["install_path"],
"hint": None,
}
)
else:
error_msg = kt_info.get("error", "Not installed")
checks.append(
{
"name": "kt-kernel",
"status": "error",
"value": error_msg,
"hint": "kt-kernel is required - run: pip install kt-kernel",
}
)
issues_found = True
# 7. System memory (with frequency if available) # 7. System memory (with frequency if available)
mem_info = detect_memory_info() mem_info = detect_memory_info()
if mem_info.frequency_mhz and mem_info.type: if mem_info.frequency_mhz and mem_info.type:
@@ -204,7 +337,6 @@ def doctor(
# 6. Required packages # 6. Required packages
packages = [ packages = [
("kt-kernel", ">=0.4.0", False), # name, version_req, required ("kt-kernel", ">=0.4.0", False), # name, version_req, required
("ktransformers", ">=0.4.0", False),
("sglang", ">=0.4.0", False), ("sglang", ">=0.4.0", False),
("torch", ">=2.4.0", True), ("torch", ">=2.4.0", True),
("transformers", ">=4.45.0", True), ("transformers", ">=4.45.0", True),

View File

@@ -10,6 +10,7 @@ import sys
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import click
import typer import typer
from kt_kernel.cli.config.settings import get_settings from kt_kernel.cli.config.settings import get_settings
@@ -30,128 +31,163 @@ from kt_kernel.cli.utils.environment import detect_cpu_info, detect_gpus, detect
from kt_kernel.cli.utils.model_registry import MODEL_COMPUTE_FUNCTIONS, ModelInfo, get_registry from kt_kernel.cli.utils.model_registry import MODEL_COMPUTE_FUNCTIONS, ModelInfo, get_registry
@click.command(
context_settings={"ignore_unknown_options": True, "allow_extra_args": True},
add_help_option=False, # We'll handle help manually to avoid conflicts
)
@click.argument("model", required=False, default=None)
@click.option("--host", "-H", default=None, help="Server host address")
@click.option("--port", "-p", type=int, default=None, help="Server port")
@click.option("--gpu-experts", type=int, default=None, help="Number of GPU experts per layer")
@click.option("--cpu-threads", type=int, default=None, help="Number of CPU inference threads")
@click.option("--numa-nodes", type=int, default=None, help="Number of NUMA nodes")
@click.option(
"--tensor-parallel-size", "--tp", "tensor_parallel_size", type=int, default=None, help="Tensor parallel size"
)
@click.option("--model-path", type=click.Path(), default=None, help="Custom model path")
@click.option("--weights-path", type=click.Path(), default=None, help="Custom quantized weights path")
@click.option("--kt-method", default=None, help="KT quantization method")
@click.option(
"--kt-gpu-prefill-threshold", "kt_gpu_prefill_threshold", type=int, default=None, help="GPU prefill token threshold"
)
@click.option("--attention-backend", default=None, help="Attention backend")
@click.option("--max-total-tokens", "max_total_tokens", type=int, default=None, help="Maximum total tokens")
@click.option("--max-running-requests", "max_running_requests", type=int, default=None, help="Maximum running requests")
@click.option("--chunked-prefill-size", "chunked_prefill_size", type=int, default=None, help="Chunked prefill size")
@click.option("--mem-fraction-static", "mem_fraction_static", type=float, default=None, help="Memory fraction static")
@click.option("--watchdog-timeout", "watchdog_timeout", type=int, default=None, help="Watchdog timeout")
@click.option("--served-model-name", "served_model_name", default=None, help="Served model name")
@click.option(
"--disable-shared-experts-fusion",
"disable_shared_experts_fusion",
is_flag=True,
default=None,
help="Disable shared experts fusion",
)
@click.option(
"--enable-shared-experts-fusion",
"enable_shared_experts_fusion",
is_flag=True,
default=False,
help="Enable shared experts fusion",
)
@click.option("--quantize", "-q", is_flag=True, default=False, help="Quantize model")
@click.option("--advanced", is_flag=True, default=False, help="Show advanced options")
@click.option("--dry-run", "dry_run", is_flag=True, default=False, help="Show command without executing")
@click.pass_context
def run( def run(
model: Optional[str] = typer.Argument( ctx: click.Context,
None, model: Optional[str],
help="Model name or path (e.g., deepseek-v3, qwen3-30b). If not specified, shows interactive selection.", host: Optional[str],
), port: Optional[int],
host: str = typer.Option( gpu_experts: Optional[int],
None, cpu_threads: Optional[int],
"--host", numa_nodes: Optional[int],
"-H", tensor_parallel_size: Optional[int],
help="Server host address", model_path: Optional[str],
), weights_path: Optional[str],
port: int = typer.Option( kt_method: Optional[str],
None, kt_gpu_prefill_threshold: Optional[int],
"--port", attention_backend: Optional[str],
"-p", max_total_tokens: Optional[int],
help="Server port", max_running_requests: Optional[int],
), chunked_prefill_size: Optional[int],
# CPU/GPU configuration mem_fraction_static: Optional[float],
gpu_experts: Optional[int] = typer.Option( watchdog_timeout: Optional[int],
None, served_model_name: Optional[str],
"--gpu-experts", disable_shared_experts_fusion: Optional[bool],
help="Number of GPU experts per layer", enable_shared_experts_fusion: bool,
), quantize: bool,
cpu_threads: Optional[int] = typer.Option( advanced: bool,
None, dry_run: bool,
"--cpu-threads",
help="Number of CPU inference threads (kt-cpuinfer, defaults to 80% of CPU cores)",
),
numa_nodes: Optional[int] = typer.Option(
None,
"--numa-nodes",
help="Number of NUMA nodes",
),
tensor_parallel_size: Optional[int] = typer.Option(
None,
"--tensor-parallel-size",
"--tp",
help="Tensor parallel size (number of GPUs)",
),
# Model paths
model_path: Optional[Path] = typer.Option(
None,
"--model-path",
help="Custom model path",
),
weights_path: Optional[Path] = typer.Option(
None,
"--weights-path",
help="Custom quantized weights path",
),
# KT-kernel options
kt_method: Optional[str] = typer.Option(
None,
"--kt-method",
help="KT quantization method (AMXINT4, RAWFP8, etc.)",
),
kt_gpu_prefill_token_threshold: Optional[int] = typer.Option(
None,
"--kt-gpu-prefill-threshold",
help="GPU prefill token threshold for kt-kernel",
),
# SGLang options
attention_backend: Optional[str] = typer.Option(
None,
"--attention-backend",
help="Attention backend (triton, flashinfer)",
),
max_total_tokens: Optional[int] = typer.Option(
None,
"--max-total-tokens",
help="Maximum total tokens",
),
max_running_requests: Optional[int] = typer.Option(
None,
"--max-running-requests",
help="Maximum running requests",
),
chunked_prefill_size: Optional[int] = typer.Option(
None,
"--chunked-prefill-size",
help="Chunked prefill size",
),
mem_fraction_static: Optional[float] = typer.Option(
None,
"--mem-fraction-static",
help="Memory fraction for static allocation",
),
watchdog_timeout: Optional[int] = typer.Option(
None,
"--watchdog-timeout",
help="Watchdog timeout in seconds",
),
served_model_name: Optional[str] = typer.Option(
None,
"--served-model-name",
help="Custom model name for API responses",
),
# Performance flags
disable_shared_experts_fusion: Optional[bool] = typer.Option(
None,
"--disable-shared-experts-fusion/--enable-shared-experts-fusion",
help="Disable/enable shared experts fusion",
),
# Other options
quantize: bool = typer.Option(
False,
"--quantize",
"-q",
help="Quantize model if weights not found",
),
advanced: bool = typer.Option(
False,
"--advanced",
help="Show advanced options",
),
dry_run: bool = typer.Option(
False,
"--dry-run",
help="Show command without executing",
),
) -> None: ) -> None:
"""Start model inference server.""" """Start model inference server.
\b
Examples: kt run deepseek-v3 | kt run m2 --tensor-parallel-size 2 | kt run /path/to/model --gpu-experts 4
\b
Custom Options: Pass any SGLang server option directly (e.g., kt run m2 --fp8-gemm-backend triton).
Common: --fp8-gemm-backend, --tool-call-parser, --reasoning-parser, --dp-size, --enable-ma
For full list: python -m sglang.launch_server --help
"""
# Handle --help manually since we disabled it
# Check sys.argv for --help or -h since ctx.args may not be set yet
if "--help" in sys.argv or "-h" in sys.argv:
click.echo(ctx.get_help())
return
# Handle disable/enable shared experts fusion flags
if enable_shared_experts_fusion:
disable_shared_experts_fusion = False
elif disable_shared_experts_fusion is None:
disable_shared_experts_fusion = None
# Convert Path objects from click
model_path_obj = Path(model_path) if model_path else None
weights_path_obj = Path(weights_path) if weights_path else None
# Get extra args that weren't parsed (unknown options)
# click stores these in ctx.args when ignore_unknown_options=True
extra_cli_args = list(ctx.args) if ctx.args else []
# Remove --help from extra args if present (already handled)
extra_cli_args = [arg for arg in extra_cli_args if arg not in ["--help", "-h"]]
# Call the actual run function implementation
_run_impl(
model=model,
host=host,
port=port,
gpu_experts=gpu_experts,
cpu_threads=cpu_threads,
numa_nodes=numa_nodes,
tensor_parallel_size=tensor_parallel_size,
model_path=model_path_obj,
weights_path=weights_path_obj,
kt_method=kt_method,
kt_gpu_prefill_threshold=kt_gpu_prefill_threshold,
attention_backend=attention_backend,
max_total_tokens=max_total_tokens,
max_running_requests=max_running_requests,
chunked_prefill_size=chunked_prefill_size,
mem_fraction_static=mem_fraction_static,
watchdog_timeout=watchdog_timeout,
served_model_name=served_model_name,
disable_shared_experts_fusion=disable_shared_experts_fusion,
quantize=quantize,
advanced=advanced,
dry_run=dry_run,
extra_cli_args=extra_cli_args,
)
def _run_impl(
model: Optional[str],
host: Optional[str],
port: Optional[int],
gpu_experts: Optional[int],
cpu_threads: Optional[int],
numa_nodes: Optional[int],
tensor_parallel_size: Optional[int],
model_path: Optional[Path],
weights_path: Optional[Path],
kt_method: Optional[str],
kt_gpu_prefill_threshold: Optional[int],
attention_backend: Optional[str],
max_total_tokens: Optional[int],
max_running_requests: Optional[int],
chunked_prefill_size: Optional[int],
mem_fraction_static: Optional[float],
watchdog_timeout: Optional[int],
served_model_name: Optional[str],
disable_shared_experts_fusion: Optional[bool],
quantize: bool,
advanced: bool,
dry_run: bool,
extra_cli_args: list[str],
) -> None:
"""Actual implementation of run command."""
# Check if SGLang is installed before proceeding # Check if SGLang is installed before proceeding
from kt_kernel.cli.utils.sglang_checker import ( from kt_kernel.cli.utils.sglang_checker import (
check_sglang_installation, check_sglang_installation,
@@ -387,7 +423,7 @@ def run(
# KT-kernel options # KT-kernel options
final_kt_method = kt_method or model_defaults.get("kt-method") or settings.get("inference.kt_method", "AMXINT4") final_kt_method = kt_method or model_defaults.get("kt-method") or settings.get("inference.kt_method", "AMXINT4")
final_kt_gpu_prefill_threshold = ( final_kt_gpu_prefill_threshold = (
kt_gpu_prefill_token_threshold kt_gpu_prefill_threshold
or model_defaults.get("kt-gpu-prefill-token-threshold") or model_defaults.get("kt-gpu-prefill-token-threshold")
or settings.get("inference.kt_gpu_prefill_token_threshold", 4096) or settings.get("inference.kt_gpu_prefill_token_threshold", 4096)
) )
@@ -456,6 +492,7 @@ def run(
disable_shared_experts_fusion=final_disable_shared_experts_fusion, disable_shared_experts_fusion=final_disable_shared_experts_fusion,
settings=settings, settings=settings,
extra_model_params=extra_params, extra_model_params=extra_params,
extra_cli_args=extra_cli_args,
) )
# Prepare environment variables # Prepare environment variables
@@ -535,29 +572,51 @@ def run(
raise typer.Exit(1) raise typer.Exit(1)
def _find_model_path(model_info: ModelInfo, settings) -> Optional[Path]: def _find_model_path(model_info: ModelInfo, settings, max_depth: int = 3) -> Optional[Path]:
"""Find the model path on disk by searching all configured model paths.""" """Find the model path on disk by searching all configured model paths.
Args:
model_info: Model information to search for
settings: Settings instance
max_depth: Maximum depth to search within each model path (default: 3)
Returns:
Path to the model directory, or None if not found
"""
model_paths = settings.get_model_paths() model_paths = settings.get_model_paths()
# Generate possible names to search for
possible_names = [
model_info.name,
model_info.name.lower(),
model_info.name.replace(" ", "-"),
model_info.hf_repo.split("/")[-1],
model_info.hf_repo.replace("/", "--"),
]
# Add alias-based names
for alias in model_info.aliases:
possible_names.append(alias)
possible_names.append(alias.lower())
# Search in all configured model directories # Search in all configured model directories
for models_dir in model_paths: for models_dir in model_paths:
# Check common path patterns if not models_dir.exists():
possible_paths = [ continue
models_dir / model_info.name,
models_dir / model_info.name.lower(),
models_dir / model_info.name.replace(" ", "-"),
models_dir / model_info.hf_repo.split("/")[-1],
models_dir / model_info.hf_repo.replace("/", "--"),
]
# Add alias-based paths # Search recursively up to max_depth
for alias in model_info.aliases: for depth in range(max_depth):
possible_paths.append(models_dir / alias) for name in possible_names:
possible_paths.append(models_dir / alias.lower()) if depth == 0:
# Direct children: models_dir / name
search_paths = [models_dir / name]
else:
# Nested: use rglob to find directories matching the name
search_paths = list(models_dir.rglob(name))
for path in possible_paths: for path in search_paths:
if path.exists() and (path / "config.json").exists(): if path.exists() and (path / "config.json").exists():
return path return path
return None return None
@@ -613,6 +672,7 @@ def _build_sglang_command(
disable_shared_experts_fusion: bool, disable_shared_experts_fusion: bool,
settings, settings,
extra_model_params: Optional[dict] = None, # New parameter for additional params extra_model_params: Optional[dict] = None, # New parameter for additional params
extra_cli_args: Optional[list[str]] = None, # Extra args from CLI to pass to sglang
) -> list[str]: ) -> list[str]:
"""Build the SGLang launch command.""" """Build the SGLang launch command."""
cmd = [ cmd = [
@@ -734,6 +794,10 @@ def _build_sglang_command(
if extra_args: if extra_args:
cmd.extend(extra_args) cmd.extend(extra_args)
# Add extra CLI args (user-provided options not defined in kt CLI)
if extra_cli_args:
cmd.extend(extra_cli_args)
return cmd return cmd

View File

@@ -68,7 +68,8 @@ def _update_help_texts() -> None:
# Register commands # Register commands
app.command(name="version", help="Show version information")(version.version) app.command(name="version", help="Show version information")(version.version)
app.command(name="run", help="Start model inference server")(run.run) # Run command is handled specially in main() to allow extra args
# (not registered here to avoid typer's argument parsing)
app.command(name="chat", help="Interactive chat with running model")(chat.chat) app.command(name="chat", help="Interactive chat with running model")(chat.chat)
app.command(name="quant", help="Quantize model weights")(quant.quant) app.command(name="quant", help="Quantize model weights")(quant.quant)
app.command(name="bench", help="Run full benchmark")(bench.bench) app.command(name="bench", help="Run full benchmark")(bench.bench)
@@ -429,6 +430,15 @@ def main():
if should_check_first_run and args: if should_check_first_run and args:
check_first_run() check_first_run()
# Handle "run" command specially to pass through unknown options
if args and args[0] == "run":
# Get args after "run"
run_args = args[1:]
# Use click command directly with ignore_unknown_options
from kt_kernel.cli.commands import run as run_module
sys.exit(run_module.run.main(args=run_args, standalone_mode=False))
app() app()

View File

@@ -280,9 +280,12 @@ class ModelRegistry:
"""List all registered models.""" """List all registered models."""
return list(self._models.values()) return list(self._models.values())
def find_local_models(self) -> list[tuple[ModelInfo, Path]]: def find_local_models(self, max_depth: int = 3) -> list[tuple[ModelInfo, Path]]:
"""Find models that are downloaded locally in any configured model path. """Find models that are downloaded locally in any configured model path.
Args:
max_depth: Maximum depth to search within each model path (default: 3)
Returns: Returns:
List of (ModelInfo, path) tuples for local models List of (ModelInfo, path) tuples for local models
""" """
@@ -297,18 +300,40 @@ class ModelRegistry:
if not models_dir.exists(): if not models_dir.exists():
continue continue
# Check common path patterns # Generate possible names to search for
possible_paths = [ possible_names = [
models_dir / model.name, model.name,
models_dir / model.name.lower(), model.name.lower(),
models_dir / model.hf_repo.split("/")[-1], model.hf_repo.split("/")[-1],
models_dir / model.hf_repo.replace("/", "--"), model.hf_repo.replace("/", "--"),
] ]
for path in possible_paths: # Search recursively up to max_depth
if path.exists() and (path / "config.json").exists(): for depth in range(max_depth):
results.append((model, path)) # Build glob pattern for current depth
found = True # depth=0: direct children, depth=1: grandchildren, etc.
glob_pattern = "*" if depth > 0 else ""
for _ in range(depth):
glob_pattern = "*/" + glob_pattern if glob_pattern else "*"
for name in possible_names:
if depth == 0:
# Direct children: models_dir / name
search_paths = [models_dir / name]
else:
# Nested: use rglob to find directories matching the name
search_paths = list(models_dir.rglob(name))
for path in search_paths:
if path.exists() and (path / "config.json").exists():
results.append((model, path))
found = True
break
if found:
break
if found:
break break
if found: if found: