""" Interactive configuration for kt run command - New Implementation. Provides step-by-step interactive configuration for running models. """ from typing import Optional, List, Dict, Any, Tuple from pathlib import Path from rich.console import Console from rich.table import Table from rich.panel import Panel from rich.prompt import Prompt, Confirm from rich import box import torch from kt_kernel.cli.i18n import t from kt_kernel.cli.utils.input_validators import ( prompt_int_with_retry, prompt_float_with_retry, prompt_choice_with_retry, prompt_int_list_with_retry, ) console = Console() def get_gpu_info() -> List[Dict[str, Any]]: """Get real-time GPU information with free VRAM.""" from kt_kernel.cli.utils.environment import detect_gpus gpus = detect_gpus() gpu_info_list = [] for i, gpu in enumerate(gpus): total_vram_gb = gpu.vram_gb free_vram_gb = gpu.vram_gb # Default fallback # Try to get real-time free VRAM if torch.cuda.is_available() and i < torch.cuda.device_count(): try: free_vram_bytes, total_vram_bytes = torch.cuda.mem_get_info(i) free_vram_gb = free_vram_bytes / (1024**3) total_vram_gb = total_vram_bytes / (1024**3) except Exception: pass # Use fallback values gpu_info_list.append( { "id": i, "name": gpu.name, "total_vram_gb": total_vram_gb, "free_vram_gb": free_vram_gb, } ) return gpu_info_list def select_model() -> Optional[Any]: """Step 1: Select a safetensors MoE model. Returns: Selected UserModel object or None if cancelled. """ from kt_kernel.cli.utils.user_model_registry import UserModelRegistry from kt_kernel.cli.commands.model import is_amx_weights registry = UserModelRegistry() all_models = registry.list_models() # Filter: safetensors models only (exclude AMX and GGUF) # Then filter to only show MoE models (matching kt model list behavior) moe_models = [] for model in all_models: if model.format == "safetensors" and model.path_exists(): is_amx, _ = is_amx_weights(model.path) if not is_amx: # Only include MoE models (is_moe == True) # Also include models not yet analyzed (is_moe == None) for backwards compatibility if model.is_moe is True or model.is_moe is None: moe_models.append(model) if not moe_models: console.print(f"[yellow]{t('run_int_no_moe_models')}[/yellow]") console.print(f" {t('run_int_add_models')}") console.print(f" {t('run_int_list_all')}") return None console.print() console.print(Panel(f"[bold cyan]{t('run_int_step1_title')}[/bold cyan]", expand=False)) console.print() # Display models using same format as kt model list from kt_kernel.cli.utils.model_scanner import format_size from kt_kernel.cli.commands.model import SHA256_STATUS_MAP table = Table(box=box.ROUNDED, show_header=True, header_style="bold cyan") table.add_column("#", justify="right", style="cyan", no_wrap=True) table.add_column("Name", style="cyan", no_wrap=True) table.add_column("Path", style="dim", overflow="fold") table.add_column("Total", justify="right") table.add_column("Exps", justify="center", style="yellow") table.add_column("Act", justify="center", style="green") table.add_column("MoE Size", justify="right", style="cyan") table.add_column("Repo", style="dim", overflow="fold") table.add_column("SHA256", justify="center") for i, model in enumerate(moe_models, 1): # Calculate size if model.path_exists(): path_obj = Path(model.path) try: files = list(path_obj.glob("*.safetensors")) total_size = sum(f.stat().st_size for f in files if f.exists()) size_display = format_size(total_size) except: size_display = "[dim]-[/dim]" else: size_display = "[dim]-[/dim]" # Format MoE info experts = f"[yellow]{model.moe_num_experts}[/yellow]" if model.moe_num_experts else "[dim]-[/dim]" active = f"[green]{model.moe_num_experts_per_tok}[/green]" if model.moe_num_experts_per_tok else "[dim]-[/dim]" moe_size = f"[cyan]{size_display}[/cyan]" if model.moe_num_experts else "[dim]-[/dim]" # Format repo info if model.repo_id: repo_abbr = "hf" if model.repo_type == "huggingface" else "ms" repo_display = f"{repo_abbr}:{model.repo_id}" else: repo_display = "[dim]-[/dim]" # Format SHA256 status sha256_display = SHA256_STATUS_MAP.get(model.sha256_status, model.sha256_status) table.add_row( str(i), model.name, str(model.path), size_display, experts, active, moe_size, repo_display, sha256_display, ) console.print(table) console.print() choice = prompt_int_with_retry( t("run_int_select_model"), default=1, min_val=1, max_val=len(moe_models), ) return moe_models[choice - 1] def select_inference_method(model: Any) -> Optional[Dict[str, Any]]: """Step 2: Select inference method. Args: model: Selected UserModel Returns: Dict with 'method' (raw/amx/gguf/saved), and method-specific fields, or None if cancelled. """ from kt_kernel.cli.utils.run_configs import RunConfigManager config_manager = RunConfigManager() saved_configs = config_manager.list_configs(model.id) # Debug output (can be removed later) if False: # Set to True for debugging console.print() console.print(f"[dim]DEBUG: Model ID: {model.id}[/dim]") console.print(f"[dim]DEBUG: Saved configs count: {len(saved_configs)}[/dim]") if saved_configs: console.print(f"[dim]DEBUG: Configs: {[c.get('config_name', '?') for c in saved_configs]}[/dim]") console.print() console.print() console.print(Panel("[bold cyan]Step 2: Select Inference Method[/bold cyan]", expand=False)) console.print() options = [] option_map = {} # Option 1: Use saved configuration (if any) if saved_configs: option_idx = len(options) + 1 console.print(f" [cyan][{option_idx}][/cyan] [bold]Use Saved Configuration[/bold]") console.print(f" [dim]{len(saved_configs)} saved config(s) available[/dim]") options.append(str(option_idx)) option_map[str(option_idx)] = "saved" # Option 2: Raw precision inference option_idx = len(options) + 1 console.print(f" [cyan][{option_idx}][/cyan] [bold]Raw Precision Inference[/bold]") console.print(" [dim]FP8 / FP8_PERCHANNEL / BF16 / RAWINT4[/dim]") options.append(str(option_idx)) option_map[str(option_idx)] = "raw" # Option 3: AMX quantized inference option_idx = len(options) + 1 console.print(f" [cyan][{option_idx}][/cyan] [bold]AMX Quantized Inference[/bold]") console.print(" [dim]INT4 / INT8 (CPU optimized)[/dim]") options.append(str(option_idx)) option_map[str(option_idx)] = "amx" # Option 4: GGUF inference option_idx = len(options) + 1 console.print(f" [cyan][{option_idx}][/cyan] [bold]GGUF Inference[/bold]") console.print(" [dim]Llamafile format[/dim]") options.append(str(option_idx)) option_map[str(option_idx)] = "gguf" console.print() choice = prompt_choice_with_retry("Select method", choices=options, default="1") method = option_map[choice] if method == "saved": return _select_saved_config(model, saved_configs) elif method == "raw": return _configure_raw_inference(model) elif method == "amx": return _configure_amx_inference(model) elif method == "gguf": return _configure_gguf_inference(model) return None def _select_saved_config(model: Any, saved_configs: List[Dict]) -> Optional[Dict[str, Any]]: """Select from saved configurations with detailed display.""" console.print() console.print("[bold]Saved Configurations:[/bold]") console.print() for i, cfg in enumerate(saved_configs, 1): # Build method display method_display = cfg.get("inference_method", "unknown").upper() kt_method = cfg.get("kt_method", "unknown") if cfg.get("inference_method") == "raw": raw_method = cfg.get("raw_method", "unknown") method_display = f"{raw_method}" elif cfg.get("inference_method") == "amx": method_display = kt_method elif cfg.get("inference_method") == "gguf": method_display = "LLAMAFILE" else: method_display = kt_method # Display config header console.print(f" [cyan][{i}][/cyan] [bold]{cfg.get('config_name', f'Config {i}')}[/bold]") console.print() # Display detailed parameters console.print(f" [yellow]KT Method:[/yellow] {method_display}") console.print(f" [yellow]NUMA Nodes:[/yellow] {cfg.get('numa_nodes', '?')}") console.print(f" [yellow]CPU Threads:[/yellow] {cfg.get('cpu_threads', '?')}") console.print(f" [yellow]GPU Experts:[/yellow] {cfg.get('gpu_experts', '?')}") console.print(f" [yellow]TP Size:[/yellow] {cfg.get('tp_size', '?')}") console.print(f" [yellow]Memory Fraction:[/yellow] {cfg.get('mem_fraction_static', '?')}") console.print(f" [yellow]Server:[/yellow] {cfg.get('host', '0.0.0.0')}:{cfg.get('port', 30000)}") # Display KV cache info if present if cfg.get("kv_cache"): console.print(f" [yellow]KV Cache:[/yellow] {cfg.get('kv_cache', '?')}") console.print(f" [yellow]Chunk Prefill:[/yellow] {cfg.get('chunk_prefill', '?')}") console.print(f" [yellow]GPU Prefill Thr:[/yellow] {cfg.get('gpu_prefill_threshold', '?')}") # Display parser info if present if cfg.get("tool_call_parser") or cfg.get("reasoning_parser"): if cfg.get("tool_call_parser"): console.print(f" [yellow]Tool Call Parser:[/yellow] {cfg.get('tool_call_parser')}") if cfg.get("reasoning_parser"): console.print(f" [yellow]Reasoning Parser:[/yellow] {cfg.get('reasoning_parser')}") console.print() # Build and display command preview cmd_preview = _build_command_preview(model, cfg) console.print(" [dim]Command:[/dim]") console.print() for line in cmd_preview: console.print(f" {line}") console.print() choice = prompt_int_with_retry( "Select configuration", default=1, min_val=1, max_val=len(saved_configs), ) selected_config = saved_configs[choice - 1].copy() selected_config["method"] = "saved" return selected_config def _build_command_preview(model: Any, cfg: Dict[str, Any]) -> List[str]: """Build command preview for saved configuration. Args: model: UserModel object cfg: Saved configuration dict Returns: List of command lines for display """ import sys host = cfg.get("host", "0.0.0.0") port = cfg.get("port", 30000) lines = [ "python -m sglang.launch_server \\", f" --host {host} \\", f" --port {port} \\", f" --model {cfg.get('model_path', '?')} \\", f" --kt-weight-path {cfg.get('weights_path', '?')} \\", f" --kt-cpuinfer {cfg.get('cpu_threads', '?')} \\", f" --kt-threadpool-count {cfg.get('numa_nodes', '?')} \\", f" --kt-num-gpu-experts {cfg.get('gpu_experts', '?')} \\", f" --kt-method {cfg.get('kt_method', '?')} \\", ] # Add GPU prefill threshold (use saved value or default) gpu_prefill = cfg.get("gpu_prefill_threshold", 500) lines.append(f" --kt-gpu-prefill-token-threshold {gpu_prefill} \\") lines.append(" --kt-enable-dynamic-expert-update \\") # Add attention backend lines.append(" --attention-backend flashinfer \\") lines.append(" --trust-remote-code \\") # Add memory and performance settings lines.append(f" --mem-fraction-static {cfg.get('mem_fraction_static', 0.9)} \\") # Add KV cache settings chunk_prefill = cfg.get("chunk_prefill", 32768) max_tokens = cfg.get("kv_cache", 32768) lines.append(f" --chunked-prefill-size {chunk_prefill} \\") lines.append(f" --max-total-tokens {max_tokens} \\") lines.append(" --max-running-requests 4 \\") lines.append(" --watchdog-timeout 3000 \\") lines.append(" --enable-mixed-chunk \\") # Add TP size (will be updated with actual GPU selection) lines.append(f" --tensor-parallel-size {cfg.get('tp_size', '?')} \\") lines.append(" --enable-p2p-check \\") # Add FP8 backend if using FP8 kt_method = cfg.get("kt_method", "") if "FP8" in kt_method.upper(): lines.append(" --fp8-gemm-backend triton \\") # Add parsers if configured if cfg.get("tool_call_parser"): lines.append(f" --tool-call-parser {cfg['tool_call_parser']} \\") if cfg.get("reasoning_parser"): lines.append(f" --reasoning-parser {cfg['reasoning_parser']} \\") # Remove trailing backslash from last line if lines: lines[-1] = lines[-1].rstrip(" \\") return lines def _configure_raw_inference(model: Any) -> Dict[str, Any]: """Configure raw precision inference.""" console.print() console.print("[bold]Select Raw Precision Type:[/bold]") console.print() console.print(" [cyan][1][/cyan] FP8") console.print(" [cyan][2][/cyan] FP8_PERCHANNEL") console.print(" [cyan][3][/cyan] BF16") console.print(" [cyan][4][/cyan] RAWINT4") console.print() choice = prompt_choice_with_retry("Select precision", choices=["1", "2", "3", "4"], default="1") precision_map = { "1": "FP8", "2": "FP8_PERCHANNEL", "3": "BF16", "4": "RAWINT4", } raw_method = precision_map[choice] return { "method": "raw", "raw_method": raw_method, "kt_method": raw_method, "model_path": model.path, "weights_path": model.path, # Same as model path for raw } def _configure_amx_inference(model: Any) -> Optional[Dict[str, Any]]: """Configure AMX quantized inference.""" from kt_kernel.cli.utils.user_model_registry import UserModelRegistry from kt_kernel.cli.commands.model import is_amx_weights registry = UserModelRegistry() all_models = registry.list_models() # Filter AMX models amx_models = [] for m in all_models: if m.format == "safetensors": is_amx, numa = is_amx_weights(m.path) if is_amx: # Check if it's derived from the selected model if m.amx_source_model == model.name: amx_models.insert(0, m) # Prioritize matched models else: amx_models.append(m) if not amx_models: console.print("[yellow]No AMX quantized models found.[/yellow]") console.print(" Quantize your model with: [cyan]kt quant[/cyan]") return None console.print() console.print("[bold]Select AMX Weights:[/bold]") console.print() for i, m in enumerate(amx_models, 1): is_amx, numa = is_amx_weights(m.path) method_str = m.amx_quant_method.upper() if m.amx_quant_method else "Unknown" match_indicator = "[green]★[/green]" if m.amx_source_model == model.name else " " console.print(f" {match_indicator} [cyan][{i}][/cyan] {m.name}") console.print( f" [dim]Method: AMX{method_str}, NUMA: {numa}, Source: {m.amx_source_model or 'Unknown'}[/dim]" ) console.print() choice = prompt_int_with_retry( "Select AMX weights", default=1, min_val=1, max_val=len(amx_models), ) selected_amx = amx_models[choice - 1] is_amx, numa = is_amx_weights(selected_amx.path) kt_method = f"AMX{selected_amx.amx_quant_method.upper()}" if selected_amx.amx_quant_method else "AMXINT4" return { "method": "amx", "kt_method": kt_method, "model_path": model.path, "weights_path": selected_amx.path, "amx_numa_nodes": numa, } def _configure_gguf_inference(model: Any) -> Optional[Dict[str, Any]]: """Configure GGUF inference.""" from kt_kernel.cli.utils.user_model_registry import UserModelRegistry registry = UserModelRegistry() all_models = registry.list_models() # Filter GGUF models gguf_models = [m for m in all_models if m.format == "gguf"] if not gguf_models: console.print("[yellow]No GGUF models found.[/yellow]") console.print(" Add GGUF models with: [cyan]kt model add /path/to/model.gguf[/cyan]") return None console.print() console.print("[bold]Select GGUF Weights:[/bold]") console.print() for i, m in enumerate(gguf_models, 1): console.print(f" [cyan][{i}][/cyan] {m.name}") console.print(f" [dim]Path: {m.path}[/dim]") console.print() choice = prompt_int_with_retry( "Select GGUF weights", default=1, min_val=1, max_val=len(gguf_models), ) selected_gguf = gguf_models[choice - 1] return { "method": "gguf", "kt_method": "LLAMAFILE", "model_path": model.path, "weights_path": selected_gguf.path, } def configure_numa_and_cpu(method_config: Dict[str, Any]) -> Dict[str, int]: """Step 3: Configure NUMA and CPU threads. Args: method_config: Config from step 2 (may contain amx_numa_nodes hint) Returns: Dict with 'numa_nodes' and 'cpu_threads' """ from kt_kernel.cli.utils.environment import detect_cpu_info cpu_info = detect_cpu_info() max_numa = cpu_info.numa_nodes max_cores = cpu_info.threads # Use logical threads instead of physical cores console.print() console.print(Panel("[bold cyan]Step 3: NUMA and CPU Configuration[/bold cyan]", expand=False)) console.print() # Show AMX hint if applicable if method_config.get("method") == "amx" and method_config.get("amx_numa_nodes"): amx_numa = method_config["amx_numa_nodes"] console.print(f"[yellow]⚠ Note: This AMX model was quantized with NUMA={amx_numa}[/yellow]") console.print(f"[yellow] For optimal performance, use the same NUMA setting.[/yellow]") console.print() default_numa = amx_numa else: default_numa = max_numa numa_nodes = prompt_int_with_retry( f"NUMA Nodes (1 to {max_numa})", default=default_numa, min_val=1, max_val=max_numa, ) default_threads = int(max_cores * 0.8) cpu_threads = prompt_int_with_retry( f"CPU Threads (1 to {max_cores})", default=default_threads, min_val=1, max_val=max_cores, ) return { "numa_nodes": numa_nodes, "cpu_threads": cpu_threads, } def configure_gpu_experts(model: Any) -> int: """Step 4: Configure GPU expert count. Args: model: Selected model Returns: Number of GPU experts """ from kt_kernel.cli.utils.analyze_moe_model import analyze_moe_model console.print() console.print(Panel("[bold cyan]Step 4: GPU Experts Configuration[/bold cyan]", expand=False)) console.print() # Try to get num_experts from model try: moe_result = analyze_moe_model(model.path) num_experts = moe_result.get("num_experts", 256) except Exception: num_experts = 256 # Default fallback console.print(f"[dim]Model has {num_experts} experts total[/dim]") console.print() console.print("[yellow]⚠ Tip: More GPU experts = faster inference, but uses more VRAM[/yellow]") console.print() default_experts = min(8, num_experts) gpu_experts = prompt_int_with_retry( f"GPU Experts per layer (0 to {num_experts})", default=default_experts, min_val=0, max_val=num_experts, ) return gpu_experts def configure_kv_cache(is_raw_inference: bool) -> Optional[Dict[str, int]]: """Step 5: Configure KV Cache (only for raw inference). Args: is_raw_inference: True if using raw precision inference Returns: Dict with 'kv_cache', 'chunk_prefill', 'gpu_prefill_threshold' or None if not applicable """ if not is_raw_inference: return None console.print() console.print(Panel("[bold cyan]Step 5: KV Cache and Prefill Configuration[/bold cyan]", expand=False)) console.print() console.print("[dim]These settings control memory allocation and prefill batch size[/dim]") console.print("[dim]gpu-prefill-token-threshold: maximum length for single layerwise prefill[/dim]") console.print() kv_cache = prompt_int_with_retry("KV Cache Size (max_total_tokens)", default=32768, min_val=1) chunk_prefill = prompt_int_with_retry("Chunk Prefill Size", default=32768, min_val=1) gpu_prefill_threshold = prompt_int_with_retry("GPU Prefill Token Threshold", default=500, min_val=1) return { "kv_cache": kv_cache, "chunk_prefill": chunk_prefill, "gpu_prefill_threshold": gpu_prefill_threshold, } def select_gpus_and_tp( required_tp_size: Optional[int] = None, saved_mem_fraction: Optional[float] = None ) -> Tuple[List[int], int, float]: """Step 6: Select GPUs, TP size, and memory fraction. Args: required_tp_size: If specified, user must select exactly this many GPUs. If None, TP size can be any power of 2. saved_mem_fraction: If specified, use this memory fraction instead of prompting. Used when loading saved configurations. Returns: Tuple of (selected_gpu_ids, tp_size, mem_fraction_static) """ gpu_info_list = get_gpu_info() if not gpu_info_list: console.print("[red]No GPUs detected[/red]") return [], 0, 0.9 console.print() if required_tp_size is not None: console.print(Panel(f"[bold cyan]Select {required_tp_size} GPUs (for saved config)[/bold cyan]", expand=False)) console.print() console.print(f"[yellow]Required TP size: {required_tp_size}[/yellow]") console.print(f"[yellow]You must select exactly {required_tp_size} GPU(s)[/yellow]") else: console.print(Panel("[bold cyan]Step 6: GPU Selection and Memory[/bold cyan]", expand=False)) console.print() console.print("[dim]TP (Tensor Parallel) size must be a power of 2: 1, 2, 4, 8, ...[/dim]") console.print() # Display GPUs table = Table(box=box.ROUNDED, show_header=True, header_style="bold cyan") table.add_column("ID", justify="right", style="cyan") table.add_column("Name", style="white") table.add_column("Free VRAM", justify="right", style="green") table.add_column("Total VRAM", justify="right", style="dim") for gpu in gpu_info_list: table.add_row(str(gpu["id"]), gpu["name"], f"{gpu['free_vram_gb']:.1f} GB", f"{gpu['total_vram_gb']:.1f} GB") console.print(table) console.print() # Validator function def validate_tp_requirements(gpu_ids: List[int]) -> tuple[bool, Optional[str]]: """Validate TP requirements based on required_tp_size.""" actual_count = len(gpu_ids) if required_tp_size is not None: # Exact count required if actual_count != required_tp_size: return False, f"Must select exactly {required_tp_size} GPU(s), but you selected {actual_count}." else: # Must be power of 2 if actual_count & (actual_count - 1) != 0: return ( False, f"TP size ({actual_count}) must be a power of 2. Valid sizes: 1, 2, 4, 8, 16, 32, ...\nYou selected {actual_count} GPU(s). Please select a different number.", ) return True, None # Generate default GPU selection if required_tp_size is not None: # For saved config: select first N GPUs if required_tp_size <= len(gpu_info_list): default_gpus = ",".join(str(i) for i in range(required_tp_size)) else: default_gpus = ",".join(str(i) for i in range(len(gpu_info_list))) prompt_text = f"Enter {required_tp_size} GPU ID(s) separated by commas (e.g., 0,1,2,3)" else: # For new config: select all GPUs default_gpus = ",".join(str(i) for i in range(len(gpu_info_list))) prompt_text = "Enter GPU IDs separated by commas (e.g., 0,1,2,3)" console.print(prompt_text) console.print(f" Or press Enter to use all {len(gpu_info_list)} GPUs") console.print() selected_gpu_ids = prompt_int_list_with_retry( "GPU IDs", default=default_gpus, min_val=0, max_val=len(gpu_info_list) - 1, validator=validate_tp_requirements, ) tp_size = len(selected_gpu_ids) console.print() console.print(f"[green]✓[/green] Selected {tp_size} GPU(s): {selected_gpu_ids}") console.print() # Memory fraction - use saved value if provided, otherwise prompt if saved_mem_fraction is not None: mem_fraction = saved_mem_fraction console.print(f"[dim]Using saved memory fraction: {mem_fraction}[/dim]") else: mem_fraction = prompt_float_with_retry( "Static Memory Fraction (0.0-1.0)", default=0.9, min_val=0.0, max_val=1.0, ) return selected_gpu_ids, tp_size, mem_fraction def configure_parsers() -> Dict[str, Optional[str]]: """Step 7: Configure parsers (optional). Returns: Dict with 'tool_call_parser' and 'reasoning_parser' (can be None) """ console.print() console.print(Panel("[bold cyan]Step 7: Parser Configuration (Optional)[/bold cyan]", expand=False)) console.print() console.print("[dim]Press Enter to skip (no parser will be added)[/dim]") console.print() tool_call_parser = Prompt.ask("Tool Call Parser (e.g., glm47)", default="") tool_call_parser = tool_call_parser.strip() if tool_call_parser else None reasoning_parser = Prompt.ask("Reasoning Parser (e.g., glm45)", default="") reasoning_parser = reasoning_parser.strip() if reasoning_parser else None if tool_call_parser or reasoning_parser: console.print() if tool_call_parser: console.print(f"[green]✓[/green] Tool Call Parser: {tool_call_parser}") if reasoning_parser: console.print(f"[green]✓[/green] Reasoning Parser: {reasoning_parser}") else: console.print() console.print("[dim]No parsers configured[/dim]") return { "tool_call_parser": tool_call_parser, "reasoning_parser": reasoning_parser, } def configure_host_and_port() -> Dict[str, Any]: """Step 8: Configure host and port with availability check. Returns: Dict with 'host' and 'port' """ from kt_kernel.cli.utils.port_checker import is_port_available console.print() console.print(Panel("[bold cyan]Step 8: Server Configuration[/bold cyan]", expand=False)) console.print() # Get host host = Prompt.ask("Server Host", default="0.0.0.0") # Get port with availability check while True: port = prompt_int_with_retry( "Server Port", default=30000, min_val=1024, max_val=65535, ) # Check if port is available console.print() console.print(f"[dim]Checking port {port} availability...[/dim]") if is_port_available(host, port): console.print(f"[green]✓[/green] Port {port} is available") break else: console.print(f"[red]✗[/red] Port {port} is already in use") console.print() # Suggest next available port from kt_kernel.cli.utils.port_checker import find_available_port found, suggested_port = find_available_port(host, port + 1, max_attempts=100) if found: console.print(f"[yellow]Suggestion:[/yellow] Port {suggested_port} is available") console.print() console.print() console.print(f"[green]✓[/green] Server will listen on {host}:{port}") return { "host": host, "port": port, } def save_config_prompt(model: Any, full_config: Dict[str, Any]) -> bool: """Step 7: Prompt to save configuration. Args: model: Selected model full_config: Complete configuration dict Returns: True if saved, False otherwise """ console.print() console.print(Panel("[bold cyan]Step 7: Save Configuration[/bold cyan]", expand=False)) console.print() if not Confirm.ask("Save this configuration for future use?", default=True): return False config_name = Prompt.ask("Configuration name", default=f"Config {full_config.get('inference_method', 'default')}") from kt_kernel.cli.utils.run_configs import RunConfigManager config_manager = RunConfigManager() # Prepare config to save (exclude runtime-only fields and non-serializable objects) save_config = { "config_name": config_name, "inference_method": full_config["inference_method"], "kt_method": full_config["kt_method"], "model_path": str(full_config["model_path"]), "weights_path": str(full_config["weights_path"]), "numa_nodes": full_config["numa_nodes"], "cpu_threads": full_config["cpu_threads"], "gpu_experts": full_config["gpu_experts"], "tp_size": full_config["tp_size"], "mem_fraction_static": full_config["mem_fraction_static"], "host": full_config["host"], "port": full_config["port"], # Note: selected_gpus is NOT saved - user will select GPUs when loading config } # Add parser config if present if full_config.get("tool_call_parser"): save_config["tool_call_parser"] = full_config["tool_call_parser"] if full_config.get("reasoning_parser"): save_config["reasoning_parser"] = full_config["reasoning_parser"] # Add raw-specific config if present if full_config.get("raw_method"): save_config["raw_method"] = full_config["raw_method"] if full_config.get("kv_cache"): save_config["kv_cache"] = full_config["kv_cache"] save_config["chunk_prefill"] = full_config["chunk_prefill"] save_config["gpu_prefill_threshold"] = full_config["gpu_prefill_threshold"] config_manager.save_config(model.id, save_config) console.print() console.print(f"[green]✓[/green] Configuration saved: {config_name}") return True def interactive_run_config() -> Optional[Dict[str, Any]]: """ Main interactive configuration flow for kt run. Returns: Complete configuration dict or None if cancelled. """ # Step 1: Select model model = select_model() if not model: return None # Step 2: Select inference method method_config = select_inference_method(model) if not method_config: return None # If using saved config, add model object and return directly if method_config.get("method") == "saved": console.print() console.print("[green]✓[/green] Using saved configuration") # Let user select GPUs (must match saved TP size) saved_tp_size = method_config.get("tp_size", 1) console.print() console.print(f"[yellow]This configuration requires TP={saved_tp_size}[/yellow]") console.print(f"[yellow]Please select {saved_tp_size} GPU(s)[/yellow]") # Get saved memory fraction saved_mem_fraction = method_config.get("mem_fraction_static", 0.9) selected_gpus, actual_tp_size, _ = select_gpus_and_tp( required_tp_size=saved_tp_size, saved_mem_fraction=saved_mem_fraction ) if not selected_gpus: return None # Update config with selected GPUs (keep saved mem_fraction_static) method_config["selected_gpus"] = selected_gpus # tp_size is already in method_config from saved data # Check port availability from kt_kernel.cli.utils.port_checker import is_port_available, find_available_port saved_host = method_config.get("host", "0.0.0.0") saved_port = method_config.get("port", 30000) console.print() console.print(f"[dim]Checking port {saved_port} availability...[/dim]") if is_port_available(saved_host, saved_port): console.print(f"[green]✓[/green] Port {saved_port} is available") method_config["port"] = saved_port method_config["host"] = saved_host else: console.print(f"[red]✗[/red] Port {saved_port} is already in use") console.print() # Suggest next available port found, suggested_port = find_available_port(saved_host, saved_port + 1, max_attempts=100) if found: console.print(f"[yellow]Suggestion:[/yellow] Port {suggested_port} is available") console.print() # Ask user for new port while True: new_port = prompt_int_with_retry( "Enter new port", default=suggested_port if found else saved_port + 1, min_val=1024, max_val=65535, ) console.print() console.print(f"[dim]Checking port {new_port} availability...[/dim]") if is_port_available(saved_host, new_port): console.print(f"[green]✓[/green] Port {new_port} is available") method_config["port"] = new_port method_config["host"] = saved_host break else: console.print(f"[red]✗[/red] Port {new_port} is already in use") console.print() # Add model object for run.py compatibility method_config["model"] = model # Ensure paths are Path objects from pathlib import Path if "model_path" in method_config: method_config["model_path"] = Path(method_config["model_path"]) if "weights_path" in method_config: method_config["weights_path"] = Path(method_config["weights_path"]) # Display configuration summary console.print() console.print(Panel("[bold cyan]Saved Configuration[/bold cyan]", expand=False)) console.print() _display_config_summary(method_config) console.print() # Start directly without confirmation when using saved config return method_config # Step 3: Configure NUMA and CPU numa_cpu_config = configure_numa_and_cpu(method_config) # Step 4: Configure GPU experts gpu_experts = configure_gpu_experts(model) # Step 5: Configure KV Cache (only for raw) is_raw = method_config.get("method") == "raw" kv_config = configure_kv_cache(is_raw) # Step 6: Select GPUs and TP selected_gpus, tp_size, mem_fraction = select_gpus_and_tp() if not selected_gpus: return None # Step 7: Configure parsers (optional) parser_config = configure_parsers() # Step 8: Configure host and port server_config = configure_host_and_port() # Build complete configuration full_config = { "model": model, "inference_method": method_config["method"], "kt_method": method_config["kt_method"], "model_path": method_config["model_path"], "weights_path": method_config["weights_path"], **numa_cpu_config, "gpu_experts": gpu_experts, "selected_gpus": selected_gpus, "tp_size": tp_size, "mem_fraction_static": mem_fraction, **parser_config, # Add parser config **server_config, # Add server config (host, port) } # Add raw-specific config if kv_config: full_config["raw_method"] = method_config.get("raw_method") full_config.update(kv_config) # Step 9: Save configuration save_config_prompt(model, full_config) # Final confirmation console.print() console.print(Panel("[bold cyan]Configuration Complete[/bold cyan]", expand=False)) console.print() _display_config_summary(full_config) console.print() if not Confirm.ask("[bold green]Start model server with this configuration?[/bold green]", default=True): console.print("[yellow]Cancelled[/yellow]") return None return full_config def _display_config_summary(config: Dict[str, Any]): """Display configuration summary.""" model = config["model"] console.print(f" Model: {model.name}") console.print(f" KT Method: {config['kt_method']}") console.print(f" NUMA Nodes: {config['numa_nodes']}") console.print(f" CPU Threads: {config['cpu_threads']}") console.print(f" GPU Experts: {config['gpu_experts']}") # Handle both new config and saved config format tp_size = config.get("tp_size", len(config.get("selected_gpus", []))) selected_gpus = config.get("selected_gpus", []) console.print(f" GPUs: {selected_gpus} (TP={tp_size})") console.print(f" Memory Fraction: {config['mem_fraction_static']}") # Server config host = config.get("host", "0.0.0.0") port = config.get("port", 30000) console.print(f" Server: {host}:{port}") if config.get("kv_cache"): console.print(f" KV Cache: {config['kv_cache']}") console.print(f" Chunk Prefill: {config['chunk_prefill']}") console.print(f" GPU Prefill Thr: {config['gpu_prefill_threshold']}") # Display parsers if configured if config.get("tool_call_parser") or config.get("reasoning_parser"): console.print() if config.get("tool_call_parser"): console.print(f" Tool Call Parser: {config['tool_call_parser']}") if config.get("reasoning_parser"): console.print(f" Reasoning Parser: {config['reasoning_parser']}")