ktransformers/kt-kernel/python/cli/utils/run_interactive.py
Oql 56cbd69ac4 kt-cli enhancement (#1834)
* [feat]: redesign kt run interactive configuration with i18n support

- Redesign kt run with 8-step interactive flow (model selection, inference method, NUMA/CPU, GPU experts, KV cache, GPU/TP selection, parsers, host/port)
- Add configuration save/load system (~/.ktransformers/run_configs.yaml)
- Add i18n support for kt chat (en/zh translations)
- Add universal input validators with auto-retry and Chinese comma support
- Add port availability checker with auto-suggestion
- Add parser configuration (--tool-call-parser, --reasoning-parser)
- Remove tuna command and clean up redundant files
- Fix variable reference bug in run.py; filter the model list to show only MoE models
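
The input validators and port checker mentioned above live under kt_kernel/cli/utils (input_validators.py, port_checker.py). A minimal sketch of the retry-based integer prompt and the availability check, with illustrative bodies rather than the actual implementations (the list-prompt variant additionally normalizes the full-width Chinese comma "，" to "," before splitting):

    import socket
    from rich.prompt import Prompt

    def prompt_int_with_retry(label, default=None, min_val=None, max_val=None):
        # Re-prompt until the user enters an integer inside [min_val, max_val].
        while True:
            kwargs = {} if default is None else {"default": str(default)}
            raw = Prompt.ask(label, **kwargs)
            try:
                value = int(str(raw).strip())
            except ValueError:
                continue
            if min_val is not None and value < min_val:
                continue
            if max_val is not None and value > max_val:
                continue
            return value

    def is_port_available(host: str, port: int) -> bool:
        # A port is treated as free if a TCP socket can bind to it.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            try:
                sock.bind((host, port))
                return True
            except OSError:
                return False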

* [feat]: unify model selection UI and enable shared experts fusion by default

- Unify kt run model selection table with kt model list display
  * Add Total size, MoE Size, Repo, and SHA256 status columns
  * Use consistent formatting and styling
  * Improve user decision-making with more information

- Enable --disable-shared-experts-fusion by default
  * Change default value from False to True
  * Users can still override with --enable-shared-experts-fusion

* [feat]: improve kt chat with performance metrics and better CJK support

- Add performance metrics display after each response
  * Total time, TTFT (Time To First Token), TPOT (Time Per Output Token)
  * Accurate input/output token counts using model tokenizer
  * Fallback to estimation if tokenizer unavailable
  * Metrics shown in dim style (not prominent)
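
A rough sketch of how these metrics can be derived from stream timestamps (names are illustrative, not the kt chat internals):

    import time

    request_sent = time.perf_counter()
    first_token_at = None
    output_tokens = 0
    # ... consume the streamed response here:
    #     on the first chunk:  first_token_at = time.perf_counter()
    #     on every chunk:      output_tokens += 1
    finished_at = time.perf_counter()

    total_time = finished_at - request_sent
    ttft = (first_token_at - request_sent) if first_token_at else None
    tpot = ((finished_at - first_token_at) / max(output_tokens - 1, 1)) if first_token_at else None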

- Fix Chinese character input issues
  * Replace Prompt.ask() with console.input() for better CJK support
  * Fixes backspace deletion showing half-characters
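
A minimal illustration of the swap (prompt text is illustrative):

    from rich.console import Console

    console = Console()
    # Prompt.ask("[bold]You[/bold]") could leave half-character artifacts when deleting CJK input;
    # Console.input hands line editing to the terminal, which handles wide characters cleanly.
    text = console.input("[bold]You:[/bold] ")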

- Suppress NumPy subnormal warnings
  * Filter "The value of the smallest subnormal" warnings
  * Cleaner CLI output on certain hardware environments
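
This amounts to a message-based warnings filter, roughly:

    import warnings

    # Hide NumPy's "The value of the smallest subnormal ..." UserWarning, which some
    # hardware/BIOS configurations trigger when subnormals are flushed to zero.
    warnings.filterwarnings("ignore", message="The value of the smallest subnormal")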

* [fix]: correct TTFT measurement in kt chat

- Move start_time initialization before API call
- Previously start_time was set when receiving first chunk, causing TTFT ≈ 0ms
- Now correctly measures time from request sent to first token received
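
Schematically, assuming a hypothetical stream_chat() callable that performs the API call and yields response chunks:

    import time

    def measure_ttft(stream_chat):
        start_time = time.perf_counter()  # fixed: start timing before the request is sent
        first_token_time = None
        for _chunk in stream_chat():
            if first_token_time is None:
                first_token_time = time.perf_counter()
        # Previously start_time was only set here, on the first chunk, so TTFT collapsed to ~0 ms.
        return (first_token_time - start_time) if first_token_time else None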

* [docs]: Add Clawdbot integration guide - an enterprise-grade AI assistant deployment solution for KTransformers

* [docs]: Emphasize Kimi K2.5 as the recommended core model, highlighting enterprise-grade inference capability

* [docs]: Add a link to the Clawdbot Feishu integration tutorial

* [feat]: improve CLI table display, model verification, and chat experience

- Add sequence number (#) column to all model tables by default
- Filter kt edit to show only MoE GPU models (exclude AMX)
- Extend kt model verify to check *.json and *.py files in addition to weights
- Fix re-verification bug where repaired files caused false failures
- Suppress tokenizer debug output in kt chat token counting
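
The extended verification set can be thought of as the weight shards plus config/code files, e.g. (hypothetical helper, not the actual kt model verify code):

    from pathlib import Path

    def files_to_verify(model_dir: str) -> list:
        # Weight shards plus the *.json / *.py files now covered by `kt model verify`.
        root = Path(model_dir)
        patterns = ("*.safetensors", "*.json", "*.py")
        return sorted(p for pattern in patterns for p in root.glob(pattern))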

* [fix]: fix CPU core count (use logical threads instead of physical cores).

---------

Co-authored-by: skqliao <skqliao@gmail.com>
2026-02-04 16:44:54 +08:00


"""
Interactive configuration for kt run command - New Implementation.
Provides step-by-step interactive configuration for running models.
"""
from typing import Optional, List, Dict, Any, Tuple
from pathlib import Path
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.prompt import Prompt, Confirm
from rich import box
import torch
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.input_validators import (
prompt_int_with_retry,
prompt_float_with_retry,
prompt_choice_with_retry,
prompt_int_list_with_retry,
)
console = Console()
def get_gpu_info() -> List[Dict[str, Any]]:
"""Get real-time GPU information with free VRAM."""
from kt_kernel.cli.utils.environment import detect_gpus
gpus = detect_gpus()
gpu_info_list = []
for i, gpu in enumerate(gpus):
total_vram_gb = gpu.vram_gb
free_vram_gb = gpu.vram_gb # Default fallback
# Try to get real-time free VRAM
if torch.cuda.is_available() and i < torch.cuda.device_count():
try:
free_vram_bytes, total_vram_bytes = torch.cuda.mem_get_info(i)
free_vram_gb = free_vram_bytes / (1024**3)
total_vram_gb = total_vram_bytes / (1024**3)
except Exception:
pass # Use fallback values
gpu_info_list.append(
{
"id": i,
"name": gpu.name,
"total_vram_gb": total_vram_gb,
"free_vram_gb": free_vram_gb,
}
)
return gpu_info_list
def select_model() -> Optional[Any]:
"""Step 1: Select a safetensors MoE model.
Returns:
Selected UserModel object or None if cancelled.
"""
from kt_kernel.cli.utils.user_model_registry import UserModelRegistry
from kt_kernel.cli.commands.model import is_amx_weights
registry = UserModelRegistry()
all_models = registry.list_models()
# Filter: safetensors models only (exclude AMX and GGUF)
# Then filter to only show MoE models (matching kt model list behavior)
moe_models = []
for model in all_models:
if model.format == "safetensors" and model.path_exists():
is_amx, _ = is_amx_weights(model.path)
if not is_amx:
# Only include MoE models (is_moe == True)
# Also include models not yet analyzed (is_moe == None) for backwards compatibility
if model.is_moe is True or model.is_moe is None:
moe_models.append(model)
if not moe_models:
console.print(f"[yellow]{t('run_int_no_moe_models')}[/yellow]")
console.print(f" {t('run_int_add_models')}")
console.print(f" {t('run_int_list_all')}")
return None
console.print()
console.print(Panel(f"[bold cyan]{t('run_int_step1_title')}[/bold cyan]", expand=False))
console.print()
# Display models using same format as kt model list
from kt_kernel.cli.utils.model_scanner import format_size
from kt_kernel.cli.commands.model import SHA256_STATUS_MAP
table = Table(box=box.ROUNDED, show_header=True, header_style="bold cyan")
table.add_column("#", justify="right", style="cyan", no_wrap=True)
table.add_column("Name", style="cyan", no_wrap=True)
table.add_column("Path", style="dim", overflow="fold")
table.add_column("Total", justify="right")
table.add_column("Exps", justify="center", style="yellow")
table.add_column("Act", justify="center", style="green")
table.add_column("MoE Size", justify="right", style="cyan")
table.add_column("Repo", style="dim", overflow="fold")
table.add_column("SHA256", justify="center")
for i, model in enumerate(moe_models, 1):
# Calculate size
if model.path_exists():
path_obj = Path(model.path)
try:
files = list(path_obj.glob("*.safetensors"))
total_size = sum(f.stat().st_size for f in files if f.exists())
size_display = format_size(total_size)
except Exception:  # glob/stat failure - fall back to placeholder
size_display = "[dim]-[/dim]"
else:
size_display = "[dim]-[/dim]"
# Format MoE info
experts = f"[yellow]{model.moe_num_experts}[/yellow]" if model.moe_num_experts else "[dim]-[/dim]"
active = f"[green]{model.moe_num_experts_per_tok}[/green]" if model.moe_num_experts_per_tok else "[dim]-[/dim]"
moe_size = f"[cyan]{size_display}[/cyan]" if model.moe_num_experts else "[dim]-[/dim]"
# Format repo info
if model.repo_id:
repo_abbr = "hf" if model.repo_type == "huggingface" else "ms"
repo_display = f"{repo_abbr}:{model.repo_id}"
else:
repo_display = "[dim]-[/dim]"
# Format SHA256 status
sha256_display = SHA256_STATUS_MAP.get(model.sha256_status, model.sha256_status)
table.add_row(
str(i),
model.name,
str(model.path),
size_display,
experts,
active,
moe_size,
repo_display,
sha256_display,
)
console.print(table)
console.print()
choice = prompt_int_with_retry(
t("run_int_select_model"),
default=1,
min_val=1,
max_val=len(moe_models),
)
return moe_models[choice - 1]
def select_inference_method(model: Any) -> Optional[Dict[str, Any]]:
"""Step 2: Select inference method.
Args:
model: Selected UserModel
Returns:
Dict with 'method' (raw/amx/gguf/saved), and method-specific fields, or None if cancelled.
"""
from kt_kernel.cli.utils.run_configs import RunConfigManager
config_manager = RunConfigManager()
saved_configs = config_manager.list_configs(model.id)
# Debug output (can be removed later)
if False: # Set to True for debugging
console.print()
console.print(f"[dim]DEBUG: Model ID: {model.id}[/dim]")
console.print(f"[dim]DEBUG: Saved configs count: {len(saved_configs)}[/dim]")
if saved_configs:
console.print(f"[dim]DEBUG: Configs: {[c.get('config_name', '?') for c in saved_configs]}[/dim]")
console.print()
console.print()
console.print(Panel("[bold cyan]Step 2: Select Inference Method[/bold cyan]", expand=False))
console.print()
options = []
option_map = {}
# Option 1: Use saved configuration (if any)
if saved_configs:
option_idx = len(options) + 1
console.print(f" [cyan][{option_idx}][/cyan] [bold]Use Saved Configuration[/bold]")
console.print(f" [dim]{len(saved_configs)} saved config(s) available[/dim]")
options.append(str(option_idx))
option_map[str(option_idx)] = "saved"
# Option 2: Raw precision inference
option_idx = len(options) + 1
console.print(f" [cyan][{option_idx}][/cyan] [bold]Raw Precision Inference[/bold]")
console.print(" [dim]FP8 / FP8_PERCHANNEL / BF16 / RAWINT4[/dim]")
options.append(str(option_idx))
option_map[str(option_idx)] = "raw"
# Option 3: AMX quantized inference
option_idx = len(options) + 1
console.print(f" [cyan][{option_idx}][/cyan] [bold]AMX Quantized Inference[/bold]")
console.print(" [dim]INT4 / INT8 (CPU optimized)[/dim]")
options.append(str(option_idx))
option_map[str(option_idx)] = "amx"
# Option 4: GGUF inference
option_idx = len(options) + 1
console.print(f" [cyan][{option_idx}][/cyan] [bold]GGUF Inference[/bold]")
console.print(" [dim]Llamafile format[/dim]")
options.append(str(option_idx))
option_map[str(option_idx)] = "gguf"
console.print()
choice = prompt_choice_with_retry("Select method", choices=options, default="1")
method = option_map[choice]
if method == "saved":
return _select_saved_config(model, saved_configs)
elif method == "raw":
return _configure_raw_inference(model)
elif method == "amx":
return _configure_amx_inference(model)
elif method == "gguf":
return _configure_gguf_inference(model)
return None
def _select_saved_config(model: Any, saved_configs: List[Dict]) -> Optional[Dict[str, Any]]:
"""Select from saved configurations with detailed display."""
console.print()
console.print("[bold]Saved Configurations:[/bold]")
console.print()
for i, cfg in enumerate(saved_configs, 1):
# Build method display
method_display = cfg.get("inference_method", "unknown").upper()
kt_method = cfg.get("kt_method", "unknown")
if cfg.get("inference_method") == "raw":
raw_method = cfg.get("raw_method", "unknown")
method_display = f"{raw_method}"
elif cfg.get("inference_method") == "amx":
method_display = kt_method
elif cfg.get("inference_method") == "gguf":
method_display = "LLAMAFILE"
else:
method_display = kt_method
# Display config header
console.print(f" [cyan][{i}][/cyan] [bold]{cfg.get('config_name', f'Config {i}')}[/bold]")
console.print()
# Display detailed parameters
console.print(f" [yellow]KT Method:[/yellow] {method_display}")
console.print(f" [yellow]NUMA Nodes:[/yellow] {cfg.get('numa_nodes', '?')}")
console.print(f" [yellow]CPU Threads:[/yellow] {cfg.get('cpu_threads', '?')}")
console.print(f" [yellow]GPU Experts:[/yellow] {cfg.get('gpu_experts', '?')}")
console.print(f" [yellow]TP Size:[/yellow] {cfg.get('tp_size', '?')}")
console.print(f" [yellow]Memory Fraction:[/yellow] {cfg.get('mem_fraction_static', '?')}")
console.print(f" [yellow]Server:[/yellow] {cfg.get('host', '0.0.0.0')}:{cfg.get('port', 30000)}")
# Display KV cache info if present
if cfg.get("kv_cache"):
console.print(f" [yellow]KV Cache:[/yellow] {cfg.get('kv_cache', '?')}")
console.print(f" [yellow]Chunk Prefill:[/yellow] {cfg.get('chunk_prefill', '?')}")
console.print(f" [yellow]GPU Prefill Thr:[/yellow] {cfg.get('gpu_prefill_threshold', '?')}")
# Display parser info if present
if cfg.get("tool_call_parser") or cfg.get("reasoning_parser"):
if cfg.get("tool_call_parser"):
console.print(f" [yellow]Tool Call Parser:[/yellow] {cfg.get('tool_call_parser')}")
if cfg.get("reasoning_parser"):
console.print(f" [yellow]Reasoning Parser:[/yellow] {cfg.get('reasoning_parser')}")
console.print()
# Build and display command preview
cmd_preview = _build_command_preview(model, cfg)
console.print(" [dim]Command:[/dim]")
console.print()
for line in cmd_preview:
console.print(f" {line}")
console.print()
choice = prompt_int_with_retry(
"Select configuration",
default=1,
min_val=1,
max_val=len(saved_configs),
)
selected_config = saved_configs[choice - 1].copy()
selected_config["method"] = "saved"
return selected_config
def _build_command_preview(model: Any, cfg: Dict[str, Any]) -> List[str]:
"""Build command preview for saved configuration.
Args:
model: UserModel object
cfg: Saved configuration dict
Returns:
List of command lines for display
"""
host = cfg.get("host", "0.0.0.0")
port = cfg.get("port", 30000)
lines = [
"python -m sglang.launch_server \\",
f" --host {host} \\",
f" --port {port} \\",
f" --model {cfg.get('model_path', '?')} \\",
f" --kt-weight-path {cfg.get('weights_path', '?')} \\",
f" --kt-cpuinfer {cfg.get('cpu_threads', '?')} \\",
f" --kt-threadpool-count {cfg.get('numa_nodes', '?')} \\",
f" --kt-num-gpu-experts {cfg.get('gpu_experts', '?')} \\",
f" --kt-method {cfg.get('kt_method', '?')} \\",
]
# Add GPU prefill threshold (use saved value or default)
gpu_prefill = cfg.get("gpu_prefill_threshold", 500)
lines.append(f" --kt-gpu-prefill-token-threshold {gpu_prefill} \\")
lines.append(" --kt-enable-dynamic-expert-update \\")
# Add attention backend
lines.append(" --attention-backend flashinfer \\")
lines.append(" --trust-remote-code \\")
# Add memory and performance settings
lines.append(f" --mem-fraction-static {cfg.get('mem_fraction_static', 0.9)} \\")
# Add KV cache settings
chunk_prefill = cfg.get("chunk_prefill", 32768)
max_tokens = cfg.get("kv_cache", 32768)
lines.append(f" --chunked-prefill-size {chunk_prefill} \\")
lines.append(f" --max-total-tokens {max_tokens} \\")
lines.append(" --max-running-requests 4 \\")
lines.append(" --watchdog-timeout 3000 \\")
lines.append(" --enable-mixed-chunk \\")
# Add TP size (will be updated with actual GPU selection)
lines.append(f" --tensor-parallel-size {cfg.get('tp_size', '?')} \\")
lines.append(" --enable-p2p-check \\")
# Add FP8 backend if using FP8
kt_method = cfg.get("kt_method", "")
if "FP8" in kt_method.upper():
lines.append(" --fp8-gemm-backend triton \\")
# Add parsers if configured
if cfg.get("tool_call_parser"):
lines.append(f" --tool-call-parser {cfg['tool_call_parser']} \\")
if cfg.get("reasoning_parser"):
lines.append(f" --reasoning-parser {cfg['reasoning_parser']} \\")
# Remove trailing backslash from last line
if lines:
lines[-1] = lines[-1].rstrip(" \\")
return lines
def _configure_raw_inference(model: Any) -> Dict[str, Any]:
"""Configure raw precision inference."""
console.print()
console.print("[bold]Select Raw Precision Type:[/bold]")
console.print()
console.print(" [cyan][1][/cyan] FP8")
console.print(" [cyan][2][/cyan] FP8_PERCHANNEL")
console.print(" [cyan][3][/cyan] BF16")
console.print(" [cyan][4][/cyan] RAWINT4")
console.print()
choice = prompt_choice_with_retry("Select precision", choices=["1", "2", "3", "4"], default="1")
precision_map = {
"1": "FP8",
"2": "FP8_PERCHANNEL",
"3": "BF16",
"4": "RAWINT4",
}
raw_method = precision_map[choice]
return {
"method": "raw",
"raw_method": raw_method,
"kt_method": raw_method,
"model_path": model.path,
"weights_path": model.path, # Same as model path for raw
}
def _configure_amx_inference(model: Any) -> Optional[Dict[str, Any]]:
"""Configure AMX quantized inference."""
from kt_kernel.cli.utils.user_model_registry import UserModelRegistry
from kt_kernel.cli.commands.model import is_amx_weights
registry = UserModelRegistry()
all_models = registry.list_models()
# Filter AMX models
amx_models = []
for m in all_models:
if m.format == "safetensors":
is_amx, numa = is_amx_weights(m.path)
if is_amx:
# Check if it's derived from the selected model
if m.amx_source_model == model.name:
amx_models.insert(0, m) # Prioritize matched models
else:
amx_models.append(m)
if not amx_models:
console.print("[yellow]No AMX quantized models found.[/yellow]")
console.print(" Quantize your model with: [cyan]kt quant[/cyan]")
return None
console.print()
console.print("[bold]Select AMX Weights:[/bold]")
console.print()
for i, m in enumerate(amx_models, 1):
is_amx, numa = is_amx_weights(m.path)
method_str = m.amx_quant_method.upper() if m.amx_quant_method else "Unknown"
match_indicator = "[green]★[/green]" if m.amx_source_model == model.name else " "
console.print(f" {match_indicator} [cyan][{i}][/cyan] {m.name}")
console.print(
f" [dim]Method: AMX{method_str}, NUMA: {numa}, Source: {m.amx_source_model or 'Unknown'}[/dim]"
)
console.print()
choice = prompt_int_with_retry(
"Select AMX weights",
default=1,
min_val=1,
max_val=len(amx_models),
)
selected_amx = amx_models[choice - 1]
is_amx, numa = is_amx_weights(selected_amx.path)
kt_method = f"AMX{selected_amx.amx_quant_method.upper()}" if selected_amx.amx_quant_method else "AMXINT4"
return {
"method": "amx",
"kt_method": kt_method,
"model_path": model.path,
"weights_path": selected_amx.path,
"amx_numa_nodes": numa,
}
def _configure_gguf_inference(model: Any) -> Optional[Dict[str, Any]]:
"""Configure GGUF inference."""
from kt_kernel.cli.utils.user_model_registry import UserModelRegistry
registry = UserModelRegistry()
all_models = registry.list_models()
# Filter GGUF models
gguf_models = [m for m in all_models if m.format == "gguf"]
if not gguf_models:
console.print("[yellow]No GGUF models found.[/yellow]")
console.print(" Add GGUF models with: [cyan]kt model add /path/to/model.gguf[/cyan]")
return None
console.print()
console.print("[bold]Select GGUF Weights:[/bold]")
console.print()
for i, m in enumerate(gguf_models, 1):
console.print(f" [cyan][{i}][/cyan] {m.name}")
console.print(f" [dim]Path: {m.path}[/dim]")
console.print()
choice = prompt_int_with_retry(
"Select GGUF weights",
default=1,
min_val=1,
max_val=len(gguf_models),
)
selected_gguf = gguf_models[choice - 1]
return {
"method": "gguf",
"kt_method": "LLAMAFILE",
"model_path": model.path,
"weights_path": selected_gguf.path,
}
def configure_numa_and_cpu(method_config: Dict[str, Any]) -> Dict[str, int]:
"""Step 3: Configure NUMA and CPU threads.
Args:
method_config: Config from step 2 (may contain amx_numa_nodes hint)
Returns:
Dict with 'numa_nodes' and 'cpu_threads'
"""
from kt_kernel.cli.utils.environment import detect_cpu_info
cpu_info = detect_cpu_info()
max_numa = cpu_info.numa_nodes
max_cores = cpu_info.threads # Use logical threads instead of physical cores
console.print()
console.print(Panel("[bold cyan]Step 3: NUMA and CPU Configuration[/bold cyan]", expand=False))
console.print()
# Show AMX hint if applicable
if method_config.get("method") == "amx" and method_config.get("amx_numa_nodes"):
amx_numa = method_config["amx_numa_nodes"]
console.print(f"[yellow]⚠ Note: This AMX model was quantized with NUMA={amx_numa}[/yellow]")
console.print(f"[yellow] For optimal performance, use the same NUMA setting.[/yellow]")
console.print()
default_numa = amx_numa
else:
default_numa = max_numa
numa_nodes = prompt_int_with_retry(
f"NUMA Nodes (1 to {max_numa})",
default=default_numa,
min_val=1,
max_val=max_numa,
)
default_threads = int(max_cores * 0.8)
cpu_threads = prompt_int_with_retry(
f"CPU Threads (1 to {max_cores})",
default=default_threads,
min_val=1,
max_val=max_cores,
)
return {
"numa_nodes": numa_nodes,
"cpu_threads": cpu_threads,
}
def configure_gpu_experts(model: Any) -> int:
"""Step 4: Configure GPU expert count.
Args:
model: Selected model
Returns:
Number of GPU experts
"""
from kt_kernel.cli.utils.analyze_moe_model import analyze_moe_model
console.print()
console.print(Panel("[bold cyan]Step 4: GPU Experts Configuration[/bold cyan]", expand=False))
console.print()
# Try to get num_experts from model
try:
moe_result = analyze_moe_model(model.path)
num_experts = moe_result.get("num_experts", 256)
except Exception:
num_experts = 256 # Default fallback
console.print(f"[dim]Model has {num_experts} experts total[/dim]")
console.print()
console.print("[yellow]⚠ Tip: More GPU experts = faster inference, but uses more VRAM[/yellow]")
console.print()
default_experts = min(8, num_experts)
gpu_experts = prompt_int_with_retry(
f"GPU Experts per layer (0 to {num_experts})",
default=default_experts,
min_val=0,
max_val=num_experts,
)
return gpu_experts
def configure_kv_cache(is_raw_inference: bool) -> Optional[Dict[str, int]]:
"""Step 5: Configure KV Cache (only for raw inference).
Args:
is_raw_inference: True if using raw precision inference
Returns:
Dict with 'kv_cache', 'chunk_prefill', 'gpu_prefill_threshold' or None if not applicable
"""
if not is_raw_inference:
return None
console.print()
console.print(Panel("[bold cyan]Step 5: KV Cache and Prefill Configuration[/bold cyan]", expand=False))
console.print()
console.print("[dim]These settings control memory allocation and prefill batch size[/dim]")
console.print("[dim]gpu-prefill-token-threshold: maximum length for single layerwise prefill[/dim]")
console.print()
kv_cache = prompt_int_with_retry("KV Cache Size (max_total_tokens)", default=32768, min_val=1)
chunk_prefill = prompt_int_with_retry("Chunk Prefill Size", default=32768, min_val=1)
gpu_prefill_threshold = prompt_int_with_retry("GPU Prefill Token Threshold", default=500, min_val=1)
return {
"kv_cache": kv_cache,
"chunk_prefill": chunk_prefill,
"gpu_prefill_threshold": gpu_prefill_threshold,
}
def select_gpus_and_tp(
required_tp_size: Optional[int] = None, saved_mem_fraction: Optional[float] = None
) -> Tuple[List[int], int, float]:
"""Step 6: Select GPUs, TP size, and memory fraction.
Args:
required_tp_size: If specified, user must select exactly this many GPUs.
If None, TP size can be any power of 2.
saved_mem_fraction: If specified, use this memory fraction instead of prompting.
Used when loading saved configurations.
Returns:
Tuple of (selected_gpu_ids, tp_size, mem_fraction_static)
"""
gpu_info_list = get_gpu_info()
if not gpu_info_list:
console.print("[red]No GPUs detected[/red]")
return [], 0, 0.9
console.print()
if required_tp_size is not None:
console.print(Panel(f"[bold cyan]Select {required_tp_size} GPUs (for saved config)[/bold cyan]", expand=False))
console.print()
console.print(f"[yellow]Required TP size: {required_tp_size}[/yellow]")
console.print(f"[yellow]You must select exactly {required_tp_size} GPU(s)[/yellow]")
else:
console.print(Panel("[bold cyan]Step 6: GPU Selection and Memory[/bold cyan]", expand=False))
console.print()
console.print("[dim]TP (Tensor Parallel) size must be a power of 2: 1, 2, 4, 8, ...[/dim]")
console.print()
# Display GPUs
table = Table(box=box.ROUNDED, show_header=True, header_style="bold cyan")
table.add_column("ID", justify="right", style="cyan")
table.add_column("Name", style="white")
table.add_column("Free VRAM", justify="right", style="green")
table.add_column("Total VRAM", justify="right", style="dim")
for gpu in gpu_info_list:
table.add_row(str(gpu["id"]), gpu["name"], f"{gpu['free_vram_gb']:.1f} GB", f"{gpu['total_vram_gb']:.1f} GB")
console.print(table)
console.print()
# Validator function
def validate_tp_requirements(gpu_ids: List[int]) -> tuple[bool, Optional[str]]:
"""Validate TP requirements based on required_tp_size."""
actual_count = len(gpu_ids)
if required_tp_size is not None:
# Exact count required
if actual_count != required_tp_size:
return False, f"Must select exactly {required_tp_size} GPU(s), but you selected {actual_count}."
else:
# Must be power of 2
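# (n & (n - 1)) == 0 only when n has exactly one bit set, i.e. n is a power of two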
if actual_count & (actual_count - 1) != 0:
return (
False,
f"TP size ({actual_count}) must be a power of 2. Valid sizes: 1, 2, 4, 8, 16, 32, ...\nYou selected {actual_count} GPU(s). Please select a different number.",
)
return True, None
# Generate default GPU selection
if required_tp_size is not None:
# For saved config: select first N GPUs
if required_tp_size <= len(gpu_info_list):
default_gpus = ",".join(str(i) for i in range(required_tp_size))
else:
default_gpus = ",".join(str(i) for i in range(len(gpu_info_list)))
prompt_text = f"Enter {required_tp_size} GPU ID(s) separated by commas (e.g., 0,1,2,3)"
else:
# For new config: select all GPUs
default_gpus = ",".join(str(i) for i in range(len(gpu_info_list)))
prompt_text = "Enter GPU IDs separated by commas (e.g., 0,1,2,3)"
console.print(prompt_text)
console.print(f" Or press Enter to use all {len(gpu_info_list)} GPUs")
console.print()
selected_gpu_ids = prompt_int_list_with_retry(
"GPU IDs",
default=default_gpus,
min_val=0,
max_val=len(gpu_info_list) - 1,
validator=validate_tp_requirements,
)
tp_size = len(selected_gpu_ids)
console.print()
console.print(f"[green]✓[/green] Selected {tp_size} GPU(s): {selected_gpu_ids}")
console.print()
# Memory fraction - use saved value if provided, otherwise prompt
if saved_mem_fraction is not None:
mem_fraction = saved_mem_fraction
console.print(f"[dim]Using saved memory fraction: {mem_fraction}[/dim]")
else:
mem_fraction = prompt_float_with_retry(
"Static Memory Fraction (0.0-1.0)",
default=0.9,
min_val=0.0,
max_val=1.0,
)
return selected_gpu_ids, tp_size, mem_fraction
def configure_parsers() -> Dict[str, Optional[str]]:
"""Step 7: Configure parsers (optional).
Returns:
Dict with 'tool_call_parser' and 'reasoning_parser' (can be None)
"""
console.print()
console.print(Panel("[bold cyan]Step 7: Parser Configuration (Optional)[/bold cyan]", expand=False))
console.print()
console.print("[dim]Press Enter to skip (no parser will be added)[/dim]")
console.print()
tool_call_parser = Prompt.ask("Tool Call Parser (e.g., glm47)", default="")
tool_call_parser = tool_call_parser.strip() if tool_call_parser else None
reasoning_parser = Prompt.ask("Reasoning Parser (e.g., glm45)", default="")
reasoning_parser = reasoning_parser.strip() if reasoning_parser else None
if tool_call_parser or reasoning_parser:
console.print()
if tool_call_parser:
console.print(f"[green]✓[/green] Tool Call Parser: {tool_call_parser}")
if reasoning_parser:
console.print(f"[green]✓[/green] Reasoning Parser: {reasoning_parser}")
else:
console.print()
console.print("[dim]No parsers configured[/dim]")
return {
"tool_call_parser": tool_call_parser,
"reasoning_parser": reasoning_parser,
}
def configure_host_and_port() -> Dict[str, Any]:
"""Step 8: Configure host and port with availability check.
Returns:
Dict with 'host' and 'port'
"""
from kt_kernel.cli.utils.port_checker import is_port_available
console.print()
console.print(Panel("[bold cyan]Step 8: Server Configuration[/bold cyan]", expand=False))
console.print()
# Get host
host = Prompt.ask("Server Host", default="0.0.0.0")
# Get port with availability check
while True:
port = prompt_int_with_retry(
"Server Port",
default=30000,
min_val=1024,
max_val=65535,
)
# Check if port is available
console.print()
console.print(f"[dim]Checking port {port} availability...[/dim]")
if is_port_available(host, port):
console.print(f"[green]✓[/green] Port {port} is available")
break
else:
console.print(f"[red]✗[/red] Port {port} is already in use")
console.print()
# Suggest next available port
from kt_kernel.cli.utils.port_checker import find_available_port
found, suggested_port = find_available_port(host, port + 1, max_attempts=100)
if found:
console.print(f"[yellow]Suggestion:[/yellow] Port {suggested_port} is available")
console.print()
console.print()
console.print(f"[green]✓[/green] Server will listen on {host}:{port}")
return {
"host": host,
"port": port,
}
def save_config_prompt(model: Any, full_config: Dict[str, Any]) -> bool:
"""Step 7: Prompt to save configuration.
Args:
model: Selected model
full_config: Complete configuration dict
Returns:
True if saved, False otherwise
"""
console.print()
console.print(Panel("[bold cyan]Step 7: Save Configuration[/bold cyan]", expand=False))
console.print()
if not Confirm.ask("Save this configuration for future use?", default=True):
return False
config_name = Prompt.ask("Configuration name", default=f"Config {full_config.get('inference_method', 'default')}")
from kt_kernel.cli.utils.run_configs import RunConfigManager
config_manager = RunConfigManager()
# Prepare config to save (exclude runtime-only fields and non-serializable objects)
save_config = {
"config_name": config_name,
"inference_method": full_config["inference_method"],
"kt_method": full_config["kt_method"],
"model_path": str(full_config["model_path"]),
"weights_path": str(full_config["weights_path"]),
"numa_nodes": full_config["numa_nodes"],
"cpu_threads": full_config["cpu_threads"],
"gpu_experts": full_config["gpu_experts"],
"tp_size": full_config["tp_size"],
"mem_fraction_static": full_config["mem_fraction_static"],
"host": full_config["host"],
"port": full_config["port"],
# Note: selected_gpus is NOT saved - user will select GPUs when loading config
}
# Add parser config if present
if full_config.get("tool_call_parser"):
save_config["tool_call_parser"] = full_config["tool_call_parser"]
if full_config.get("reasoning_parser"):
save_config["reasoning_parser"] = full_config["reasoning_parser"]
# Add raw-specific config if present
if full_config.get("raw_method"):
save_config["raw_method"] = full_config["raw_method"]
if full_config.get("kv_cache"):
save_config["kv_cache"] = full_config["kv_cache"]
save_config["chunk_prefill"] = full_config["chunk_prefill"]
save_config["gpu_prefill_threshold"] = full_config["gpu_prefill_threshold"]
config_manager.save_config(model.id, save_config)
console.print()
console.print(f"[green]✓[/green] Configuration saved: {config_name}")
return True
def interactive_run_config() -> Optional[Dict[str, Any]]:
"""
Main interactive configuration flow for kt run.
Returns:
Complete configuration dict or None if cancelled.
"""
# Step 1: Select model
model = select_model()
if not model:
return None
# Step 2: Select inference method
method_config = select_inference_method(model)
if not method_config:
return None
# If using saved config, add model object and return directly
if method_config.get("method") == "saved":
console.print()
console.print("[green]✓[/green] Using saved configuration")
# Let user select GPUs (must match saved TP size)
saved_tp_size = method_config.get("tp_size", 1)
console.print()
console.print(f"[yellow]This configuration requires TP={saved_tp_size}[/yellow]")
console.print(f"[yellow]Please select {saved_tp_size} GPU(s)[/yellow]")
# Get saved memory fraction
saved_mem_fraction = method_config.get("mem_fraction_static", 0.9)
selected_gpus, actual_tp_size, _ = select_gpus_and_tp(
required_tp_size=saved_tp_size, saved_mem_fraction=saved_mem_fraction
)
if not selected_gpus:
return None
# Update config with selected GPUs (keep saved mem_fraction_static)
method_config["selected_gpus"] = selected_gpus
# tp_size is already in method_config from saved data
# Check port availability
from kt_kernel.cli.utils.port_checker import is_port_available, find_available_port
saved_host = method_config.get("host", "0.0.0.0")
saved_port = method_config.get("port", 30000)
console.print()
console.print(f"[dim]Checking port {saved_port} availability...[/dim]")
if is_port_available(saved_host, saved_port):
console.print(f"[green]✓[/green] Port {saved_port} is available")
method_config["port"] = saved_port
method_config["host"] = saved_host
else:
console.print(f"[red]✗[/red] Port {saved_port} is already in use")
console.print()
# Suggest next available port
found, suggested_port = find_available_port(saved_host, saved_port + 1, max_attempts=100)
if found:
console.print(f"[yellow]Suggestion:[/yellow] Port {suggested_port} is available")
console.print()
# Ask user for new port
while True:
new_port = prompt_int_with_retry(
"Enter new port",
default=suggested_port if found else saved_port + 1,
min_val=1024,
max_val=65535,
)
console.print()
console.print(f"[dim]Checking port {new_port} availability...[/dim]")
if is_port_available(saved_host, new_port):
console.print(f"[green]✓[/green] Port {new_port} is available")
method_config["port"] = new_port
method_config["host"] = saved_host
break
else:
console.print(f"[red]✗[/red] Port {new_port} is already in use")
console.print()
# Add model object for run.py compatibility
method_config["model"] = model
# Ensure paths are Path objects
from pathlib import Path
if "model_path" in method_config:
method_config["model_path"] = Path(method_config["model_path"])
if "weights_path" in method_config:
method_config["weights_path"] = Path(method_config["weights_path"])
# Display configuration summary
console.print()
console.print(Panel("[bold cyan]Saved Configuration[/bold cyan]", expand=False))
console.print()
_display_config_summary(method_config)
console.print()
# Start directly without confirmation when using saved config
return method_config
# Step 3: Configure NUMA and CPU
numa_cpu_config = configure_numa_and_cpu(method_config)
# Step 4: Configure GPU experts
gpu_experts = configure_gpu_experts(model)
# Step 5: Configure KV Cache (only for raw)
is_raw = method_config.get("method") == "raw"
kv_config = configure_kv_cache(is_raw)
# Step 6: Select GPUs and TP
selected_gpus, tp_size, mem_fraction = select_gpus_and_tp()
if not selected_gpus:
return None
# Step 7: Configure parsers (optional)
parser_config = configure_parsers()
# Step 8: Configure host and port
server_config = configure_host_and_port()
# Build complete configuration
full_config = {
"model": model,
"inference_method": method_config["method"],
"kt_method": method_config["kt_method"],
"model_path": method_config["model_path"],
"weights_path": method_config["weights_path"],
**numa_cpu_config,
"gpu_experts": gpu_experts,
"selected_gpus": selected_gpus,
"tp_size": tp_size,
"mem_fraction_static": mem_fraction,
**parser_config, # Add parser config
**server_config, # Add server config (host, port)
}
# Add raw-specific config
if kv_config:
full_config["raw_method"] = method_config.get("raw_method")
full_config.update(kv_config)
# Step 9: Save configuration
save_config_prompt(model, full_config)
# Final confirmation
console.print()
console.print(Panel("[bold cyan]Configuration Complete[/bold cyan]", expand=False))
console.print()
_display_config_summary(full_config)
console.print()
if not Confirm.ask("[bold green]Start model server with this configuration?[/bold green]", default=True):
console.print("[yellow]Cancelled[/yellow]")
return None
return full_config
def _display_config_summary(config: Dict[str, Any]):
"""Display configuration summary."""
model = config["model"]
console.print(f" Model: {model.name}")
console.print(f" KT Method: {config['kt_method']}")
console.print(f" NUMA Nodes: {config['numa_nodes']}")
console.print(f" CPU Threads: {config['cpu_threads']}")
console.print(f" GPU Experts: {config['gpu_experts']}")
# Handle both new config and saved config format
tp_size = config.get("tp_size", len(config.get("selected_gpus", [])))
selected_gpus = config.get("selected_gpus", [])
console.print(f" GPUs: {selected_gpus} (TP={tp_size})")
console.print(f" Memory Fraction: {config['mem_fraction_static']}")
# Server config
host = config.get("host", "0.0.0.0")
port = config.get("port", 30000)
console.print(f" Server: {host}:{port}")
if config.get("kv_cache"):
console.print(f" KV Cache: {config['kv_cache']}")
console.print(f" Chunk Prefill: {config['chunk_prefill']}")
console.print(f" GPU Prefill Thr: {config['gpu_prefill_threshold']}")
# Display parsers if configured
if config.get("tool_call_parser") or config.get("reasoning_parser"):
console.print()
if config.get("tool_call_parser"):
console.print(f" Tool Call Parser: {config['tool_call_parser']}")
if config.get("reasoning_parser"):
console.print(f" Reasoning Parser: {config['reasoning_parser']}")